hive/core/framework/host/colony_runtime.py

"""ColonyRuntime — Orchestrates a colony of parallel worker clones.

Each worker is an exact copy of the queen's AgentLoop — same tools,
same prompt, same LLM. Workers run independently and report results
back to the queen via the event bus.

The ColonyRuntime replaces both AgentHost and ExecutionManager.
There are no graphs, no edges, no nodes, no data buffers.
Just: spawn N independent clones, let them run, collect results.
"""

from __future__ import annotations

import asyncio
import json
import logging
import os
import time
from collections import OrderedDict
from collections.abc import Callable
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any

from framework.agent_loop.types import AgentContext, AgentSpec
from framework.host.event_bus import AgentEvent, EventBus, EventType
from framework.host.triggers import TriggerDefinition
from framework.host.worker import Worker, WorkerInfo, WorkerResult
from framework.schemas.goal import Goal
from framework.storage.concurrent import ConcurrentStorage
from framework.storage.session_store import SessionStore

if TYPE_CHECKING:
    from framework.llm.provider import LLMProvider, Tool
    from framework.skills.manager import SkillsManagerConfig
    from framework.tracker.runtime_log_store import RuntimeLogStore

logger = logging.getLogger(__name__)


def _format_spawn_task_message(task: str, input_data: dict[str, Any]) -> str:
    """Render the spawn task into the worker's next user message.

    Spawned workers inherit the queen's conversation via
    ``ColonyRuntime._fork_parent_conversation``; this helper builds
    the content of the trailing user message that carries the new
    task. The queen's chat already provides the context for the
    task, so we frame this as an explicit hand-off.

    Additional keys from ``input_data`` (other than the task itself)
    are rendered below the hand-off line so the worker sees them as
    structured hand-off data. This mirrors the fresh-path
    ``AgentLoop._build_initial_message`` shape so worker prompts look
    roughly the same whether or not inheritance fired.
    """
    lines = [
        "# New task delegated by the queen",
        "",
        "The queen's conversation up to this point is visible above. "
        "Use it as context (who the user is, what was already decided, "
        "which skills apply). Your own system prompt and tool set are "
        "set by the framework — the queen's tools may differ from "
        "yours, so treat her prior tool calls as history only.",
        "",
        f"task: {task}",
    ]
    for key, value in (input_data or {}).items():
        if key in ("task", "user_request"):
            # Already rendered above; don't duplicate.
            continue
        if value is None:
            continue
        lines.append(f"{key}: {value}")
    return "\n".join(lines)


def _env_int(name: str, default: int) -> int:
    """Read a positive int from env; fall back to default on missing/invalid."""
    raw = os.environ.get(name)
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        logger.warning("Invalid %s=%r; using default %d", name, raw, default)
        return default
    return value if value > 0 else default


# Laptop-safe default. Each worker is a full AgentLoop (Claude SDK session +
# tool catalog), so ~4 concurrent is the realistic ceiling on a dev machine.
# Override via HIVE_MAX_CONCURRENT_WORKERS for servers.
_DEFAULT_MAX_CONCURRENT_WORKERS = _env_int("HIVE_MAX_CONCURRENT_WORKERS", 4)


@dataclass
class ColonyConfig:
    max_concurrent_workers: int = _DEFAULT_MAX_CONCURRENT_WORKERS
    cache_ttl: float = 60.0
    batch_interval: float = 0.1
    max_history: int = 1000
    result_retention_max: int = 1000
    result_retention_ttl_seconds: float | None = None
    idempotency_ttl_seconds: float = 300.0
    idempotency_max_keys: int = 10000
    webhook_host: str = "127.0.0.1"
    webhook_port: int = 8080
    webhook_routes: list[dict] = field(default_factory=list)
    max_resurrections: int = 3


@dataclass
class TriggerSpec:
    """Specification for a trigger that auto-spawns workers."""

    id: str
    name: str
    trigger_type: str  # "webhook", "api", "timer", "event", "manual"
    trigger_config: dict[str, Any] = field(default_factory=dict)
    isolation_level: str = "shared"
    priority: int = 0
    max_concurrent: int = 10
    max_resurrections: int = 3


class StreamEventBus(EventBus):
    """Proxy that stamps ``colony_id`` on every published event."""

    def __init__(self, bus: EventBus, colony_id: str) -> None:
        self._real_bus = bus
        self._colony_id = colony_id
        self.last_activity_time: float = time.monotonic()

    async def publish(self, event: AgentEvent) -> None:
        event.colony_id = self._colony_id
        self.last_activity_time = time.monotonic()
        await self._real_bus.publish(event)

    def subscribe(self, *args: Any, **kwargs: Any) -> str:
        return self._real_bus.subscribe(*args, **kwargs)

    def unsubscribe(self, subscription_id: str) -> bool:
        return self._real_bus.unsubscribe(subscription_id)

    def get_history(self, *args: Any, **kwargs: Any) -> list:
        return self._real_bus.get_history(*args, **kwargs)

    def get_stats(self) -> dict:
        return self._real_bus.get_stats()

    async def wait_for(self, *args: Any, **kwargs: Any) -> Any:
        return await self._real_bus.wait_for(*args, **kwargs)


class ColonyRuntime:
    """Orchestrates a colony of parallel worker clones.

    Each worker is an exact copy of the queen's AgentLoop. Workers run
    independently, report results via the event bus, and terminate.

    Supports:
    - Spawning/stopping workers
    - Timer and webhook triggers that auto-spawn workers
    - Pipeline middleware (credentials, tools, skills)
    - Event pub/sub for queen-worker communication
    """

    def __init__(
        self,
        agent_spec: AgentSpec,
        goal: Goal,
        storage_path: str | Path,
        llm: LLMProvider | None = None,
        tools: list[Tool] | None = None,
        tool_executor: Callable | None = None,
        config: ColonyConfig | None = None,
        runtime_log_store: RuntimeLogStore | None = None,
        colony_id: str | None = None,
        accounts_prompt: str = "",
        accounts_data: list[dict] | None = None,
        tool_provider_map: dict[str, str] | None = None,
        event_bus: EventBus | None = None,
        skills_manager_config: SkillsManagerConfig | None = None,
        skills_catalog_prompt: str = "",
        protocols_prompt: str = "",
        skill_dirs: list[str] | None = None,
        pipeline_stages: list | None = None,
        queen_id: str | None = None,
        colony_name: str | None = None,
    ):
        from framework.pipeline.runner import PipelineRunner
        from framework.skills.manager import SkillsManager

        self._agent_spec = agent_spec
        self._goal = goal
        self._config = config or ColonyConfig()
        self._runtime_log_store = runtime_log_store
        self._queen_id: str | None = queen_id
        # ``colony_id`` is the event-bus scope (session.id in DM sessions);
        # ``colony_name`` is the on-disk identity under ~/.hive/colonies/.
        # They coincide for forked colonies but diverge for queen DM
        # sessions, so separate them explicitly.
        self._colony_name: str | None = colony_name

        if pipeline_stages:
            self._pipeline = PipelineRunner(pipeline_stages)
        else:
            self._pipeline = self._load_pipeline_from_config()

        # Resolve per-colony override paths so UI toggles can reach this
        # runtime. Callers that build their own SkillsManagerConfig stay
        # in charge; bare construction auto-wires the standard paths.
        _effective_cfg = skills_manager_config
        if _effective_cfg is None and not (skills_catalog_prompt or protocols_prompt):
            _effective_cfg = self._build_default_skills_config(colony_name, queen_id)

        if _effective_cfg is not None:
            self._skills_manager = SkillsManager(_effective_cfg)
            self._skills_manager.load()
        elif skills_catalog_prompt or protocols_prompt:
            import warnings

            warnings.warn(
                "Passing pre-rendered skills_catalog_prompt/protocols_prompt "
                "is deprecated. Pass skills_manager_config instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            self._skills_manager = SkillsManager.from_precomputed(skills_catalog_prompt, protocols_prompt)
        else:
            self._skills_manager = SkillsManager()
            self._skills_manager.load()

        self.skill_dirs: list[str] = self._skills_manager.allowlisted_dirs
        self.context_warn_ratio: float | None = self._skills_manager.context_warn_ratio
        self.batch_init_nudge: str | None = self._skills_manager.batch_init_nudge

        self._colony_id: str = colony_id or "primary"

        # Ensure the colony task template exists. Idempotent — if the
        # colony was created previously, this is a no-op (it just stamps
        # last_seen_session_ids if a session id is provided later).
        try:
            import asyncio as _asyncio

            from framework.tasks import TaskListRole, get_task_store
            from framework.tasks.scoping import colony_task_list_id

            _store = get_task_store()
            _list_id = colony_task_list_id(self._colony_id)
            try:
                # Best-effort: schedule on the running loop, or do it inline
                # if no loop is yet running (e.g. during construction).
                _loop = _asyncio.get_running_loop()
                _loop.create_task(_store.ensure_task_list(_list_id, role=TaskListRole.TEMPLATE))
            except RuntimeError:
                _asyncio.run(_store.ensure_task_list(_list_id, role=TaskListRole.TEMPLATE))
        except Exception:
            logger.debug("Failed to ensure colony task template", exc_info=True)

        self._accounts_prompt = accounts_prompt
        self._accounts_data = accounts_data
        self._tool_provider_map = tool_provider_map
        self._dynamic_memory_provider_factory: Callable[[str], Callable[[], str] | None] | None = None

        storage_path_obj = Path(storage_path) if isinstance(storage_path, str) else storage_path
        self._storage_path: Path = storage_path_obj
        self._storage = ConcurrentStorage(
            base_path=storage_path_obj,
            cache_ttl=self._config.cache_ttl,
            batch_interval=self._config.batch_interval,
        )
        self._session_store = SessionStore(storage_path_obj)

        self._event_bus = event_bus or EventBus(max_history=self._config.max_history)
        self._scoped_event_bus = StreamEventBus(self._event_bus, self._colony_id)

        # Make the event bus visible to the task-system event emitters so
        # task lifecycle events fan out to the same bus the rest of the
        # system uses. Idempotent — last writer wins.
        try:
            from framework.tasks.events import set_default_event_bus

            set_default_event_bus(self._event_bus)
        except Exception:
            logger.debug("Failed to register default task event bus", exc_info=True)

        self._llm = llm
        self._tools = tools or []
        self._tool_executor = tool_executor

        # Per-colony MCP tool allowlist — applied when spawning workers. A
        # value of ``None`` means "allow every MCP tool" (default), an empty
        # list disables every MCP tool, and a list of names only enables
        # those. Lifecycle / synthetic tools always pass through the filter
        # because their names are absent from ``_mcp_tool_names_all``. The
        # allowlist is re-read on every ``spawn`` so a PATCH that mutates
        # this attribute via ``set_tool_allowlist`` takes effect on the
        # NEXT worker spawn without a runtime restart. In-flight workers
        # keep the tool list they booted with — workers have no dynamic
        # tools provider today.
        self._enabled_mcp_tools: list[str] | None = None
        self._mcp_tool_names_all: set[str] = set()

        # Worker management
        self._workers: dict[str, Worker] = {}
        # The persistent client-facing overseer (optional). Set by
        # ``start_overseer()`` at session start. In a DM session the
        # overseer is the queen chatting with the user with 0 parallel
        # workers. In a colony session she's the queen orchestrating N
        # parallel workers.
        self._overseer: Worker | None = None
        self._triggers: dict[str, TriggerSpec] = {}
        self._trigger_definitions: dict[str, TriggerDefinition] = {}

        # Timer/webhook infrastructure
        self._event_subscriptions: list[str] = []
        self._timer_tasks: list[asyncio.Task] = []
        self._timer_next_fire: dict[str, float] = {}
        self._webhook_server: Any = None
        # Background tasks owned by the runtime that aren't timers —
        # e.g. the per-spawn soft/hard timeout watchers kicked off by
        # run_parallel_workers. We hold strong references so asyncio
        # does not garbage-collect them mid-sleep (Python's asyncio
        # docs explicitly warn that create_task() needs a referenced
        # handle).
        self._background_tasks: set[asyncio.Task] = set()

        # Idempotency
        self._idempotency_keys: OrderedDict[str, str] = OrderedDict()
        self._idempotency_times: dict[str, float] = {}

        # User presence
        self._last_user_input_time: float = 0.0

        # Result retention
        self._execution_results: OrderedDict[str, WorkerResult] = OrderedDict()
        self._execution_result_times: dict[str, float] = {}

        self._running = False
        self._timers_paused = False
        self._lock = asyncio.Lock()

        self.intro_message: str = ""

    @property
    def skills_catalog_prompt(self) -> str:
        return self._skills_manager.skills_catalog_prompt

    @property
    def protocols_prompt(self) -> str:
        return self._skills_manager.protocols_prompt

    @property
    def colony_id(self) -> str:
        return self._colony_id

    @property
    def agent_id(self) -> str:
        return self._colony_id

    @property
    def goal(self) -> Goal:
        """The colony's overall goal.

        Exposed as a public property for queen lifecycle tools that
        introspect the runtime (e.g. ``get_worker_status``,
        ``get_goal_progress``). Previously only available as the private
        ``_goal`` attribute.
        """
        return self._goal

    @property
    def overseer(self) -> Worker | None:
        """The colony's long-running client-facing overseer worker.

        ``None`` until ``start_overseer()`` has been called. The overseer
        is a persistent ``Worker`` that wraps the queen's ``AgentLoop``
        and routes user chat via ``inject(message)``.
        """
        return self._overseer

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def event_bus(self) -> EventBus:
        return self._event_bus

    @property
    def timers_paused(self) -> bool:
        return self._timers_paused

    @property
    def user_idle_seconds(self) -> float:
        if self._last_user_input_time == 0.0:
            return float("inf")
        return time.monotonic() - self._last_user_input_time

    @property
    def agent_idle_seconds(self) -> float:
        if not self._workers:
            return float("inf")
        min_idle = float("inf")
        now = time.monotonic()
        for w in self._workers.values():
            if w.is_active and w._started_at > 0:
                idle = now - w._started_at
                if idle < min_idle:
                    min_idle = idle
        bus_idle = now - self._scoped_event_bus.last_activity_time
        return min(min_idle, bus_idle)

    @property
    def active_worker_count(self) -> int:
        return sum(1 for w in self._workers.values() if w.is_active)

    def _apply_pipeline_results(self) -> None:
        for stage in self._pipeline.stages:
            if stage.tool_registry is not None:
                # Register task tools on the same registry every worker
                # pulls from. Done here (not at worker spawn) so the
                # colony's `_tools` snapshot includes them.
                try:
                    from framework.tasks.tools import register_task_tools

                    register_task_tools(stage.tool_registry)
                except Exception:
                    logger.warning(
                        "Failed to register task tools on pipeline registry",
                        exc_info=True,
                    )

                tools = list(stage.tool_registry.get_tools().values())
                if tools:
                    self._tools = tools
                    self._tool_executor = stage.tool_registry.get_executor()
            if stage.llm is not None and self._llm is None:
                self._llm = stage.llm
            if stage.accounts_prompt:
                self._accounts_prompt = stage.accounts_prompt
                self._accounts_data = stage.accounts_data
                self._tool_provider_map = stage.tool_provider_map
            if stage.skills_manager is not None:
                self._skills_manager = stage.skills_manager

    @staticmethod
    def _load_pipeline_from_config():
        from framework.config import get_hive_config
        from framework.pipeline.registry import build_pipeline_from_config
        from framework.pipeline.runner import PipelineRunner

        config = get_hive_config()
        stages_config = config.get("pipeline", {}).get("stages", [])
        if not stages_config:
            return PipelineRunner([])
        return build_pipeline_from_config(stages_config)

    @staticmethod
    def _build_default_skills_config(
        colony_name: str | None,
        queen_id: str | None,
    ) -> SkillsManagerConfig:
        """Assemble a ``SkillsManagerConfig`` that wires in the per-colony /
        per-queen override files and the ``queen_ui`` / ``colony_ui`` scope
        dirs based on the standard ``~/.hive`` layout.

        ``colony_name`` must be an actual on-disk colony name
        (``~/.hive/colonies/{name}/``). DM sessions where the ``colony_id``
        is a session UUID should pass ``None`` so we don't create a stray
        override file under a session identifier.
        """
        from framework.config import COLONIES_DIR, QUEENS_DIR
        from framework.skills.discovery import ExtraScope
        from framework.skills.manager import SkillsManagerConfig

        extras: list[ExtraScope] = []
        queen_overrides_path: Path | None = None
        if queen_id:
            queen_home = QUEENS_DIR / queen_id
            queen_overrides_path = queen_home / "skills_overrides.json"
            extras.append(ExtraScope(directory=queen_home / "skills", label="queen_ui", priority=2))

        colony_overrides_path: Path | None = None
        if colony_name:
            colony_home = COLONIES_DIR / colony_name
            colony_overrides_path = colony_home / "skills_overrides.json"
            # Surface both the new flat ``skills/`` (where new skills are
            # written) and the legacy nested ``.hive/skills/`` (left intact
            # for pre-flatten colonies) as tagged ``colony_ui`` scopes, so
            # UI-created entries resolve with correct provenance regardless
            # of which on-disk layout the colony has.
            extras.append(
                ExtraScope(
                    directory=colony_home / "skills",
                    label="colony_ui",
                    priority=3,
                )
            )
            extras.append(
                ExtraScope(
                    directory=colony_home / ".hive" / "skills",
                    label="colony_ui",
                    priority=3,
                )
            )

        return SkillsManagerConfig(
            queen_id=queen_id,
            queen_overrides_path=queen_overrides_path,
            colony_name=colony_name,
            colony_overrides_path=colony_overrides_path,
            extra_scope_dirs=extras,
            interactive=False,  # HTTP-driven runtimes never prompt for consent
        )

    @property
    def queen_id(self) -> str | None:
        """The queen that owns this runtime, if known."""
        return self._queen_id

    @property
    def colony_name(self) -> str | None:
        """The on-disk colony name (distinct from event-bus scope ``colony_id``)."""
        return self._colony_name

    @property
    def skills_manager(self):
        """Access the live :class:`SkillsManager` (for HTTP handlers)."""
        return self._skills_manager

    async def reload_skills(self) -> dict[str, Any]:
        """Rebuild the catalog after an override change; in-flight workers
        pick up the new catalog on their next iteration via
        ``dynamic_skills_catalog_provider``.

        Returns a small stats dict that HTTP handlers can echo back to
        the UI ("applied — N skills now in catalog").
        """
        async with self._skills_manager.mutation_lock:
            self._skills_manager.reload()
            self.skill_dirs = self._skills_manager.allowlisted_dirs
            self.batch_init_nudge = self._skills_manager.batch_init_nudge
            self.context_warn_ratio = self._skills_manager.context_warn_ratio
            catalog_prompt = self._skills_manager.skills_catalog_prompt
            return {
                "catalog_chars": len(catalog_prompt),
                "skill_dirs": list(self.skill_dirs),
            }

    # ── Per-colony tool allowlist ───────────────────────────────

    def set_tool_allowlist(
        self,
        enabled_mcp_tools: list[str] | None,
        mcp_tool_names_all: set[str] | None = None,
    ) -> None:
        """Configure the per-colony MCP tool allowlist.

        Called at construction time (from SessionManager) and again from
        the ``/api/colony/{name}/tools`` PATCH handler when a user edits
        the allowlist. The change applies to the NEXT worker spawn — we
        never mutate the tool list of a worker that is already running
        (workers have no dynamic tools provider, so hot-reloading their
        tool set would diverge from the list the LLM was already using).
        """
        self._enabled_mcp_tools = list(enabled_mcp_tools) if enabled_mcp_tools is not None else None
        if mcp_tool_names_all is not None:
            self._mcp_tool_names_all = set(mcp_tool_names_all)

    def _apply_tool_allowlist(self, tools: list) -> list:
        """Filter ``tools`` against the colony's MCP allowlist.

        Lifecycle / synthetic tools (those whose names are NOT in
        ``_mcp_tool_names_all``) are never gated. MCP tools are kept only
        when ``_enabled_mcp_tools`` is None (default allow) or contains
        their name. Input list order is preserved so downstream cache
        keys and logs stay stable.
        """
        if self._enabled_mcp_tools is None:
            return tools
        allowed = set(self._enabled_mcp_tools)
        return [
            t
            for t in tools
            if getattr(t, "name", None) not in self._mcp_tool_names_all or getattr(t, "name", None) in allowed
        ]

    # ── Lifecycle ───────────────────────────────────────────────

    async def start(self) -> None:
        if self._running:
            return

        async with self._lock:
            await self._storage.start()
            await self._pipeline.initialize_all()
            self._apply_pipeline_results()

            if self._config.webhook_routes:
                from framework.host.webhook_server import (
                    WebhookRoute,
                    WebhookServer,
                    WebhookServerConfig,
                )

                wh_config = WebhookServerConfig(
                    host=self._config.webhook_host,
                    port=self._config.webhook_port,
                )
                self._webhook_server = WebhookServer(self._event_bus, wh_config)
                for rc in self._config.webhook_routes:
                    route = WebhookRoute(
                        source_id=rc["source_id"],
                        path=rc["path"],
                        methods=rc.get("methods", ["POST"]),
                        secret=rc.get("secret"),
                    )
                    self._webhook_server.add_route(route)
                await self._webhook_server.start()

            await self._start_timers()
            await self._skills_manager.start_watching()

            self._running = True
            self._timers_paused = False
            logger.info(
                "ColonyRuntime started: colony_id=%s, triggers=%d",
                self._colony_id,
                len(self._triggers),
            )

    async def stop(self) -> None:
        if not self._running:
            return

        async with self._lock:
            await self.stop_all_workers()

            # Cancel timer tasks and *wait* for them to finish. Without
            # the wait the tasks are merely scheduled for cancellation —
            # if the runtime (or its event loop) shuts down before they
            # run their cleanup code, trigger state leaks.
            pending_timers = [t for t in self._timer_tasks if not t.done()]
            for task in pending_timers:
                task.cancel()
            if pending_timers:
                try:
                    await asyncio.wait_for(
                        asyncio.gather(*pending_timers, return_exceptions=True),
                        timeout=5.0,
                    )
                except TimeoutError:
                    logger.warning(
                        "ColonyRuntime.stop: %d timer task(s) did not finish within 5s",
                        sum(1 for t in pending_timers if not t.done()),
                    )
            self._timer_tasks.clear()

            for sub_id in self._event_subscriptions:
                self._event_bus.unsubscribe(sub_id)
            self._event_subscriptions.clear()

            if self._webhook_server:
                await self._webhook_server.stop()
                self._webhook_server = None

            await self._skills_manager.stop_watching()
            await self._storage.stop()

            self._running = False
            logger.info("ColonyRuntime stopped: colony_id=%s", self._colony_id)

    def _on_timer_task_done(self, task: asyncio.Task) -> None:
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error(
                "Timer task '%s' crashed: %s",
                task.get_name(),
                exc,
                exc_info=exc,
            )

    def pause_timers(self) -> None:
        self._timers_paused = True

    def resume_timers(self) -> None:
        self._timers_paused = False

    async def _fork_parent_conversation(
        self,
        dest_conv_dir: Path,
        *,
        task: str,
        input_data: dict[str, Any] | None = None,
    ) -> None:
        """Fork the colony's parent queen conversation into ``dest_conv_dir``.

        Copies the queen's ``parts/*.json`` and ``meta.json`` into the
        worker's fresh conversation dir, then appends a synthetic user
        message carrying the new task. The worker's subsequent
        ``AgentLoop._restore`` reads this conversation via the usual
        path — the queen's history is visible as prior turns, the task
        appears as the most recent user message, and the worker starts
        acting on it with full context.

        This is a no-op if the colony runtime doesn't own a parent
        queen conversation (e.g. a standalone colony started without a
        queen wrapper).

        Notes on filtering compatibility:
          - Queen parts have ``phase_id=None``. When the worker's
            restore applies its own phase filter, the backward-compat
            fallback in NodeConversation.restore kicks in: an
            all-None-phased store bypasses the filter. See
            ``conversation.py:1369-1378``.
          - ``cursor.json`` is deliberately NOT copied. The worker
            should start fresh at iteration 0; copying the queen's
            cursor would make the worker think it had already done
            work.
          - The queen's ``meta.json`` is copied but the AgentLoop
            immediately rebuilds ``system_prompt`` from the worker's
            own context post-restore (see agent_loop.py:533-535), so
            the queen's system prompt does not leak into the worker.
        """
        # Resolve the queen's own conversation dir. For a queen-backed
        # ColonyRuntime, storage_path points at the queen's session dir
        # and conversations/ lives inside it. For standalone runtimes
        # (tests, legacy fork path under ~/.hive/agents/{name}/worker/)
        # there's no parent conversation — fall through to the fresh
        # spawn path.
        src_conv_dir = self._storage_path / "conversations"
        src_parts_dir = src_conv_dir / "parts"
        if not src_parts_dir.exists():
            # No queen conversation to inherit — the worker starts with
            # only the task, same as the pre-fork behavior. AgentLoop's
            # fresh-conversation branch will call _build_initial_message
            # and render input_data into the worker's first user message.
            return

        def _copy_and_append() -> None:
            dest_parts = dest_conv_dir / "parts"
            dest_parts.mkdir(parents=True, exist_ok=True)

            # Copy each queen part. Use json.dumps round-trip (not raw
            # file copy) so we can be defensive about unreadable files —
            # a corrupted queen part file shouldn't take down the worker
            # spawn, just drop that one part.
            max_seq = -1
            for part_file in sorted(src_parts_dir.glob("*.json")):
                try:
                    data = json.loads(part_file.read_text(encoding="utf-8"))
                except (json.JSONDecodeError, OSError) as exc:
                    logger.warning(
                        "spawn fork: skipping unreadable queen part %s: %s",
                        part_file.name,
                        exc,
                    )
                    continue
                seq = data.get("seq")
                if isinstance(seq, int) and seq > max_seq:
                    max_seq = seq
                (dest_parts / part_file.name).write_text(
                    json.dumps(data, ensure_ascii=False),
                    encoding="utf-8",
                )

            # Copy the queen's meta.json so the worker's restore finds
            # the conversation during its first run. The meta fields
            # (system_prompt, max_context_tokens, etc.) get overridden
            # by the worker's own AgentLoop config + context after
            # restore, so nothing here bleeds into runtime behavior.
            src_meta = src_conv_dir / "meta.json"
            if src_meta.exists():
                try:
                    meta_data = json.loads(src_meta.read_text(encoding="utf-8"))
                    (dest_conv_dir / "meta.json").write_text(
                        json.dumps(meta_data, ensure_ascii=False),
                        encoding="utf-8",
                    )
                except (json.JSONDecodeError, OSError) as exc:
                    logger.warning("spawn fork: failed to copy queen meta.json: %s", exc)

            # Append the task as the next user message so the worker's
            # LLM sees it as the most recent turn in the conversation
            # after restore. This replaces the fresh-path call to
            # _build_initial_message for spawned workers.
            task_content = _format_spawn_task_message(task, input_data or {})
            next_seq = max_seq + 1
            task_part = {
                "seq": next_seq,
                "role": "user",
                "content": task_content,
                # phase_id omitted (None) so the backward-compat
                # fallback in NodeConversation.restore keeps it visible
                # to both queen-style and phase-filtered restores.
                # run_id omitted so the worker's run_id filter (off by
                # default since ctx.run_id is empty) doesn't reject it.
            }
            task_filename = f"{next_seq:010d}.json"
            (dest_parts / task_filename).write_text(
                json.dumps(task_part, ensure_ascii=False),
                encoding="utf-8",
            )
            logger.info(
                "spawn fork: inherited %d queen parts + appended task at seq %d",
                max_seq + 1,
                next_seq,
            )

        await asyncio.to_thread(_copy_and_append)

    # ── Worker Spawning ─────────────────────────────────────────

    async def spawn(
        self,
        task: str,
        count: int = 1,
        input_data: dict[str, Any] | None = None,
        session_state: dict[str, Any] | None = None,
        agent_spec: AgentSpec | None = None,
        tools: list[Any] | None = None,
        tool_executor: Callable | None = None,
        stream_id: str | None = None,
    ) -> list[str]:
        """Spawn worker clones and start them in the background.

        By default each spawn uses the colony's own ``agent_spec``,
        ``tools``, and ``tool_executor`` (set at construction). Pass
        the per-spawn override args to spawn a worker that runs
        DIFFERENT code from the colony default — used by the queen's
        ``run_agent_with_input`` tool to spawn the loaded honeycomb /
        custom worker through the unified runtime, instead of going
        through the deprecated ``AgentHost.trigger`` → ``Orchestrator``
        path that silently dropped ``user_request`` via the buffer
        filter.

        ``stream_id`` controls the SSE stream tag the worker's events
        publish under. Default is ``f"worker:{worker_id}"`` (the
        per-spawn unique tag used by parallel fan-out, which the SSE
        filter at routes_events.py drops to keep the queen DM clean
        of worker noise). Pass an explicit value when you want the
        worker's events to bypass that filter and stream to the queen
        DM. ``run_agent_with_input`` passes ``"worker"`` (singular,
        no colon) so the loaded primary worker's tool calls and LLM
        deltas reach the user's chat tab.

        Returns list of worker IDs.
        """
        if not self._running:
            raise RuntimeError("ColonyRuntime is not running")

        from framework.agent_loop.agent_loop import AgentLoop
        from framework.storage.conversation_store import FileConversationStore

        # Resolve per-spawn vs colony-default code identity
        spawn_spec = agent_spec or self._agent_spec
        spawn_tools = tools if tools is not None else self._tools
        spawn_executor = tool_executor or self._tool_executor

        # Apply the per-colony MCP tool allowlist (if any). Done HERE —
        # after spawn_tools is resolved but before it's frozen into the
        # worker's AgentContext — so the next spawn reflects any PATCH
        # that happened since the last spawn. A value of ``None`` on
        # ``_enabled_mcp_tools`` is a no-op so the default path is
        # unchanged.
        spawn_tools = self._apply_tool_allowlist(spawn_tools)

        # Colony progress tracker: when the caller supplied a db_path
        # in input_data, this worker is part of a SQLite task queue
        # and must see the hive.colony-progress-tracker skill body in
        # its system prompt from turn 0. Rebuild the catalog with the
        # skill pre-activated; falls back to the colony default when
        # no db_path is present.
        _spawn_catalog = self.skills_catalog_prompt
        _spawn_skill_dirs = self.skill_dirs
        if isinstance(input_data, dict) and input_data.get("db_path"):
            try:
                from framework.skills.config import SkillsConfig
                from framework.skills.manager import SkillsManager, SkillsManagerConfig

                _pre = SkillsManager(
                    SkillsManagerConfig(
                        skills_config=SkillsConfig.from_agent_vars(
                            skills=["hive.colony-progress-tracker"],
                        ),
                    )
                )
                _pre.load()
                _spawn_catalog = _pre.skills_catalog_prompt
                _spawn_skill_dirs = (
                    list(_pre.allowlisted_dirs) if hasattr(_pre, "allowlisted_dirs") else self.skill_dirs
                )
                logger.info(
                    "spawn: pre-activated hive.colony-progress-tracker "
                    "(catalog %d → %d chars) for worker with db_path=%s",
                    len(self.skills_catalog_prompt),
                    len(_spawn_catalog),
                    input_data.get("db_path"),
                )
            except Exception as exc:
                logger.warning(
                    "spawn: failed to pre-activate colony-progress-tracker skill, falling back to base catalog: %s",
                    exc,
                )

        # Resolve the SSE stream_id once. When the caller didn't supply
        # one we use the per-worker fan-out tag (filtered out by the
        # SSE handler). When the caller passed an explicit value we
        # honor it across the whole batch — typically count=1 for the
        # primary loaded worker that needs to stream to the queen DM.
        explicit_stream_id = stream_id

        worker_ids = []
        for i in range(count):
            worker_id = self._session_store.generate_session_id()

            # Each parallel worker gets its own storage dir under
            # {colony_session}/workers/{worker_id}/ so its conversation,
            # events, and data never leak into the overseer's tree or
            # (worse) the process CWD.
            worker_storage = self._storage_path / "workers" / worker_id
            worker_storage.mkdir(parents=True, exist_ok=True)

            # Fork the queen's conversation into the worker's store.
            # The queen already accumulated the user chat, read relevant
            # skills, and made decisions about how to approach the task;
            # the worker would repeat that discovery work (and often
            # mis-step — see the 2026-04-14 "dummy-target" incident)
            # if spawned with a blank store. We snapshot the queen's
            # parts + meta at spawn time, then append the task as the
            # next user message so the worker's AgentLoop restores into
            # a conversation that already ends with its new instruction.
            await self._fork_parent_conversation(
                worker_storage / "conversations",
                task=task,
                input_data=input_data,
            )

            worker_conv_store = FileConversationStore(worker_storage / "conversations")

            # AgentLoop takes bus/judge/config/executor at construction;
            # LLM, tools, stream_id, execution_id all come from the
            # AgentContext passed to execute().
            agent_loop = AgentLoop(
                event_bus=self._scoped_event_bus,
                tool_executor=spawn_executor,
                conversation_store=worker_conv_store,
            )

            # Workers pick up UI-driven override changes via this provider,
            # which reads the live catalog on each iteration. The db_path
            # pre-activated catalog stays static because its contents are
            # built for *this* worker's task (a tombstone toggle from the
            # UI should not yank it mid-run).
            _db_path_pre_activated = bool(isinstance(input_data, dict) and input_data.get("db_path"))
            # Default-bind the manager into the closure so each loop iteration
            # captures the same manager instance — pyflakes B023 would flag a
            # free-variable capture here.
            _provider = None if _db_path_pre_activated else (lambda mgr=self._skills_manager: mgr.skills_catalog_prompt)

            # Task-system fields. Each worker owns its session task list;
            # picked_up_from records the colony template entry it was
            # spawned for, when applicable.
            from framework.tasks.scoping import (
                colony_task_list_id as _colony_list_id,
                session_task_list_id as _session_list_id,
            )

            _worker_list_id = _session_list_id(worker_id, worker_id)
            _picked_up = None
            _template_id = input_data.get("__template_task_id") if isinstance(input_data, dict) else None
            if _template_id is not None:
                try:
                    _picked_up = (_colony_list_id(self._colony_id), int(_template_id))
                except (TypeError, ValueError):
                    _picked_up = None

            agent_context = AgentContext(
                runtime=self._make_runtime_adapter(worker_id),
                agent_id=worker_id,
                agent_spec=spawn_spec,
                input_data=input_data or {"task": task},
                goal_context=self._goal.to_prompt_context(),
                goal=self._goal,
                llm=self._llm,
                available_tools=list(spawn_tools),
                accounts_prompt=self._accounts_prompt,
                skills_catalog_prompt=_spawn_catalog,
                protocols_prompt=self.protocols_prompt,
                skill_dirs=_spawn_skill_dirs,
                dynamic_skills_catalog_provider=_provider,
                execution_id=worker_id,
                stream_id=explicit_stream_id or f"worker:{worker_id}",
                task_list_id=_worker_list_id,
                colony_id=self._colony_id,
                picked_up_from=_picked_up,
            )

            worker = Worker(
                worker_id=worker_id,
                task=task,
                agent_loop=agent_loop,
                context=agent_context,
                event_bus=self._scoped_event_bus,
                colony_id=self._colony_id,
                storage_path=worker_storage,
            )

            self._workers[worker_id] = worker
            await worker.start_background()
            worker_ids.append(worker_id)

            logger.info(
                "Spawned worker %s (%d/%d) using %s — task: %s",
                worker_id,
                i + 1,
                count,
                "override spec" if agent_spec else "colony default spec",
                task[:80],
            )

        return worker_ids

    async def spawn_batch(
        self,
        tasks: list[dict[str, Any]],
        *,
        tools_override: list[Any] | None = None,
    ) -> list[str]:
        """Spawn a batch of parallel workers, one per task spec.

        Each task spec is a dict ``{"task": str, "data": dict | None}``.
        Workers start as independent asyncio background tasks and run
        concurrently; this method returns their IDs immediately without
        waiting for completion. Use ``wait_for_worker_reports(ids,
        timeout)`` to block until they all finish.

        The overseer's ``run_parallel_workers`` tool is the usual
        caller; it pairs ``spawn_batch`` + ``wait_for_worker_reports``
        into a single fan-out/fan-in primitive.

        When ``tools_override`` is supplied, every spawned worker
        receives that tool list instead of the colony's default.  Used
        by ``run_parallel_workers`` to drop tools whose credentials
        failed the pre-flight check (so the spawned workers don't
        waste a startup trying to use them).
        """
        worker_ids: list[str] = []
        for spec in tasks:
            task_text = str(spec.get("task", ""))
            task_data = spec.get("data")
            if task_data is not None and not isinstance(task_data, dict):
                task_data = {"value": task_data}
            ids = await self.spawn(
                task=task_text,
                count=1,
                input_data=task_data or {"task": task_text},
                tools=tools_override,
            )
            worker_ids.extend(ids)
        return worker_ids

    async def wait_for_worker_reports(
        self,
        worker_ids: list[str],
        timeout: float = 600.0,
    ) -> list[dict[str, Any]]:
        """Block until every worker in ``worker_ids`` has reported.

        Subscribes to ``SUBAGENT_REPORT`` events on the colony event bus
        and collects one report per worker. If a worker has already
        reported (fast completion) the existing ``WorkerResult`` is used
        directly. On timeout, still-running workers are force-stopped
        via ``stop_worker`` and their reports are synthesised as
        ``status="timeout"``.

        Returns a list of report dicts in the same order as
        ``worker_ids``::

            [
                {
                    "worker_id": "...",
                    "status": "success" | "partial" | "failed" | "timeout" | "stopped",
                    "summary": "...",
                    "data": {...},
                    "error": "..." | None,
                    "duration_seconds": 12.3,
                    "tokens_used": 4567,
                },
                ...
            ]
        """
        if not worker_ids:
            return []

        # Reports already in hand (workers that finished before we got here)
        collected: dict[str, dict[str, Any]] = {}
        pending_ids: set[str] = set()

        for wid in worker_ids:
            worker = self._workers.get(wid)
            if worker is None:
                collected[wid] = {
                    "worker_id": wid,
                    "status": "failed",
                    "summary": "Worker not found in registry.",
                    "data": {},
                    "error": "no_such_worker",
                    "duration_seconds": 0.0,
                    "tokens_used": 0,
                }
                continue
            if not worker.is_active and worker._result is not None:
                # Already finished — synthesize from the stored result
                r = worker._result
                collected[wid] = {
                    "worker_id": wid,
                    "status": r.status,
                    "summary": r.summary,
                    "data": r.data,
                    "error": r.error,
                    "duration_seconds": r.duration_seconds,
                    "tokens_used": r.tokens_used,
                }
                continue
            pending_ids.add(wid)

        if not pending_ids:
            return [collected[wid] for wid in worker_ids]

        # Subscribe to SUBAGENT_REPORT events for the remaining workers
        report_queue: asyncio.Queue[dict[str, Any]] = asyncio.Queue()

        async def on_report(event: AgentEvent) -> None:
            data = dict(event.data or {})
            wid = data.get("worker_id")
            if wid and wid in pending_ids:
                await report_queue.put(data)

        sub_id = self._scoped_event_bus.subscribe(
            event_types=[EventType.SUBAGENT_REPORT],
            handler=on_report,
        )

        deadline = time.monotonic() + timeout
        try:
            while pending_ids:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                try:
                    report = await asyncio.wait_for(report_queue.get(), timeout=remaining)
                except TimeoutError:
                    break
                wid = report.get("worker_id")
                if wid in pending_ids:
                    collected[wid] = report
                    pending_ids.discard(wid)
        finally:
            self._scoped_event_bus.unsubscribe(sub_id)

        # Any still-pending workers are timed out — force-stop them and
        # synthesise a timeout report.
        for wid in list(pending_ids):
            try:
                await self.stop_worker(wid)
            except Exception:
                logger.exception("Failed to force-stop worker %s on timeout", wid)
            worker = self._workers.get(wid)
            duration = 0.0
            tokens = 0
            if worker is not None and worker._started_at > 0:
                duration = time.monotonic() - worker._started_at
            if worker is not None and worker._result is not None:
                tokens = worker._result.tokens_used
            collected[wid] = {
                "worker_id": wid,
                "status": "timeout",
                "summary": f"Worker did not report within {timeout:.0f}s.",
                "data": {},
                "error": "timeout",
                "duration_seconds": duration,
                "tokens_used": tokens,
            }
            pending_ids.discard(wid)

        return [collected[wid] for wid in worker_ids]

    async def start_overseer(
        self,
        queen_spec: AgentSpec,
        seed_conversation: list[dict[str, Any]] | None = None,
        queen_tools: list[Any] | None = None,
        initial_prompt: str | None = None,
    ) -> Worker:
        """Start the colony's long-running client-facing overseer.

        The overseer is a persistent ``Worker`` that wraps the queen's
        ``AgentLoop`` and:

        - Never terminates on its own (``persistent=True`` on the Worker).
        - Has the queen's full tool set, streamed with ``stream_id="overseer"``.
        - Receives user chat via ``session.colony_runtime.overseer.inject(msg)``.

        In a queen DM session the overseer runs with 0 parallel workers.
        In a colony session she can spawn parallel workers via the
        ``run_parallel_workers`` tool which calls ``spawn_batch`` +
        ``wait_for_worker_reports`` under the hood.

        Pass ``seed_conversation`` to pre-populate the overseer's
        conversation history — used when forking a DM to a colony so
        the overseer starts with the DM's prior context loaded.

        Must be called after ``start()``. Idempotent: calling a second
        time returns the already-started overseer.
        """
        if self._overseer is not None:
            return self._overseer

        if not self._running:
            raise RuntimeError("start_overseer requires the ColonyRuntime to be running (call start() first)")

        from framework.agent_loop.agent_loop import AgentLoop
        from framework.storage.conversation_store import FileConversationStore

        overseer_id = f"overseer:{self._colony_id}"

        # The overseer's conversation lives at the colony session root:
        # {colony_session}/conversations/. Workers get their own sub-dirs
        # under workers/{worker_id}/; the overseer is the root occupant.
        self._storage_path.mkdir(parents=True, exist_ok=True)
        overseer_conv_store = FileConversationStore(self._storage_path / "conversations")
        agent_loop = AgentLoop(
            event_bus=self._scoped_event_bus,
            tool_executor=self._tool_executor,
            conversation_store=overseer_conv_store,
        )

        _overseer_skills_mgr = self._skills_manager
        overseer_ctx = AgentContext(
            runtime=self._make_runtime_adapter(overseer_id),
            agent_id=overseer_id,
            agent_spec=queen_spec,
            input_data={},
            goal_context="",
            goal=self._goal,
            llm=self._llm,
            available_tools=list(queen_tools or self._tools),
            accounts_prompt=self._accounts_prompt,
            skills_catalog_prompt=self.skills_catalog_prompt,
            protocols_prompt=self.protocols_prompt,
            skill_dirs=self.skill_dirs,
            dynamic_skills_catalog_provider=lambda: _overseer_skills_mgr.skills_catalog_prompt,
            execution_id=overseer_id,
            stream_id="overseer",
        )

        overseer = Worker(
            worker_id=overseer_id,
            task="",  # no finite task — persistent conversation
            agent_loop=agent_loop,
            context=overseer_ctx,
            event_bus=self._scoped_event_bus,
            colony_id=self._colony_id,
            persistent=True,
            storage_path=self._storage_path,
        )

        if seed_conversation:
            await overseer.seed_conversation(seed_conversation)

        self._overseer = overseer
        await overseer.start_background()

        if initial_prompt:
            await overseer.inject(initial_prompt)

        logger.info(
            "Started overseer %s for colony %s (seeded=%d messages, initial_prompt=%s)",
            overseer_id,
            self._colony_id,
            len(seed_conversation or []),
            "yes" if initial_prompt else "no",
        )
        return overseer

    async def trigger(
        self,
        trigger_id: str,
        input_data: dict[str, Any],
        correlation_id: str | None = None,
        session_state: dict[str, Any] | None = None,
        idempotency_key: str | None = None,
    ) -> str:
        """Trigger a worker spawn from a trigger definition.

        Non-blocking — returns worker ID immediately.
        """
        if not self._running:
            raise RuntimeError("ColonyRuntime is not running")

        if idempotency_key is not None:
            self._prune_idempotency_keys()
            cached = self._idempotency_keys.get(idempotency_key)
            if cached is not None:
                return cached

        if self._pipeline.stages:
            from framework.pipeline.stage import PipelineContext

            pipeline_ctx = PipelineContext(
                entry_point_id=trigger_id,
                input_data=input_data,
                correlation_id=correlation_id,
                session_state=session_state,
            )
            pipeline_ctx = await self._pipeline.run(pipeline_ctx)
            input_data = pipeline_ctx.input_data

        task = input_data.get("task", json.dumps(input_data))
        worker_ids = await self.spawn(
            task=task,
            count=1,
            input_data=input_data,
            session_state=session_state,
        )

        worker_id = worker_ids[0] if worker_ids else ""

        if idempotency_key is not None and worker_id:
            self._idempotency_keys[idempotency_key] = worker_id
            self._idempotency_times[idempotency_key] = time.time()

        return worker_id

    async def trigger_and_wait(
        self,
        trigger_id: str,
        input_data: dict[str, Any],
        timeout: float | None = None,
        session_state: dict[str, Any] | None = None,
    ) -> WorkerResult | None:
        worker_id = await self.trigger(trigger_id, input_data, session_state=session_state)
        if not worker_id:
            return None
        return await self.wait_for_worker(worker_id, timeout)

    # ── Worker Control ──────────────────────────────────────────

    async def stop_worker(self, worker_id: str) -> None:
        worker = self._workers.get(worker_id)
        if worker:
            await worker.stop()
            logger.info("Stopped worker %s", worker_id)

    async def stop_all_workers(self) -> None:
        tasks = []
        for worker in self._workers.values():
            if worker.is_active:
                tasks.append(worker.stop())
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)
        self._workers.clear()

    async def send_to_worker(self, worker_id: str, message: str) -> bool:
        worker = self._workers.get(worker_id)
        if worker and worker.is_active:
            await worker.inject(message)
            return True
        return False

    def watch_batch_timeouts(
        self,
        worker_ids: list[str],
        *,
        soft_timeout: float,
        hard_timeout: float,
        warning_message: str | None = None,
    ) -> asyncio.Task:
        """Schedule a background task that enforces soft + hard timeouts.

        Semantics:
          * At ``t = soft_timeout`` every worker in ``worker_ids`` that is
            still active AND hasn't already filed an ``_explicit_report``
            receives ``warning_message`` via ``send_to_worker`` — the inject
            appears as a user turn at the next agent-loop boundary, so the
            worker's LLM can see it and call ``report_to_parent`` with
            partial results.
          * At ``t = hard_timeout`` any worker still active is force-stopped
            via ``stop_worker``. ``Worker.run`` still emits its
            ``SUBAGENT_REPORT`` on cancel (the explicit report survives,
            if the worker reported just before the stop) so the queen
            always sees a terminal inject for every spawned worker.

        Returns the scheduled task so callers can await or cancel it.
        Non-blocking for the caller — the watcher runs on the event loop
        independently.
        """
        if warning_message is None:
            grace = max(0.0, hard_timeout - soft_timeout)
            warning_message = (
                f"[SOFT TIMEOUT] You've been running for {soft_timeout:.0f}s. "
                "Wrap up now: call report_to_parent with whatever partial "
                "results you have. You have "
                f"~{grace:.0f}s more before a hard stop — anything not "
                "reported by then will be lost."
            )

        async def _watch() -> None:
            try:
                await asyncio.sleep(soft_timeout)
                for wid in worker_ids:
                    worker = self._workers.get(wid)
                    if worker is None or not worker.is_active:
                        continue
                    if getattr(worker, "_explicit_report", None) is not None:
                        continue
                    try:
                        await self.send_to_worker(wid, warning_message)
                    except Exception:
                        logger.warning(
                            "watch_batch_timeouts: soft-timeout inject failed for %s",
                            wid,
                            exc_info=True,
                        )

                remaining = hard_timeout - soft_timeout
                if remaining <= 0:
                    return
                await asyncio.sleep(remaining)
                for wid in worker_ids:
                    worker = self._workers.get(wid)
                    if worker is None or not worker.is_active:
                        continue
                    try:
                        await self.stop_worker(wid)
                        logger.info(
                            "watch_batch_timeouts: hard-stopped %s after %ss (no report)",
                            wid,
                            hard_timeout,
                        )
                    except Exception:
                        logger.warning(
                            "watch_batch_timeouts: hard-stop failed for %s",
                            wid,
                            exc_info=True,
                        )
            except asyncio.CancelledError:
                raise
            except Exception:
                logger.exception("watch_batch_timeouts: watcher crashed")

        task = asyncio.create_task(_watch(), name=f"batch-timeout:{worker_ids[0] if worker_ids else '?'}")
        # Hold a strong reference until completion. Without this the
        # task can be garbage-collected during `await asyncio.sleep`,
        # silently swallowing the soft-timeout inject (the exact bug
        # surfaced by workers never seeing [SOFT TIMEOUT]).
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return task

    # ── Status & Query ──────────────────────────────────────────

    def list_workers(self) -> list[WorkerInfo]:
        return [w.info for w in self._workers.values()]

    def get_worker(self, worker_id: str) -> Worker | None:
        return self._workers.get(worker_id)

    def list_triggers(self) -> list[TriggerSpec]:
        return list(self._triggers.values())

    def get_entry_points(self) -> list[TriggerSpec]:
        return list(self._triggers.values())

    def get_timer_next_fire_in(self, trigger_id: str) -> float | None:
        mono = self._timer_next_fire.get(trigger_id)
        if mono is not None:
            return max(0.0, mono - time.monotonic())
        return None

    def get_worker_result(self, worker_id: str) -> WorkerResult | None:
        return self._execution_results.get(worker_id)

    async def wait_for_worker(self, worker_id: str, timeout: float | None = None) -> WorkerResult | None:
        worker = self._workers.get(worker_id)
        if worker is None:
            return self._execution_results.get(worker_id)
        if worker._task_handle is None:
            return worker.info.result
        try:
            await asyncio.wait_for(asyncio.shield(worker._task_handle), timeout=timeout)
        except TimeoutError:
            return None
        return worker.info.result

    def get_stats(self) -> dict:
        return {
            "running": self._running,
            "colony_id": self._colony_id,
            "active_workers": self.active_worker_count,
            "total_workers": len(self._workers),
            "triggers": len(self._triggers),
            "event_bus": self._event_bus.get_stats(),
        }

    def get_active_streams(self) -> list[dict[str, Any]]:
        result = []
        for wid, worker in self._workers.items():
            if worker.is_active:
                result.append(
                    {
                        "colony_id": self._colony_id,
                        "worker_id": wid,
                        "status": worker.status.value,
                        "task": worker.task[:100],
                    }
                )
        return result

    async def inject_input(
        self,
        worker_id: str,
        content: str,
        *,
        is_client_input: bool = False,
        image_content: list[dict[str, Any]] | None = None,
    ) -> bool:
        self._last_user_input_time = time.monotonic()
        worker = self._workers.get(worker_id)
        if worker and worker.is_active:
            loop = worker._agent_loop
            if hasattr(loop, "inject_event"):
                await loop.inject_event(content, is_client_input=is_client_input, image_content=image_content)
                return True
        return False

    # ── Event Subscriptions ─────────────────────────────────────

    def subscribe_to_events(
        self,
        event_types: list,
        handler: Callable,
        filter_stream: str | None = None,
        filter_colony: str | None = None,
    ) -> str:
        return self._event_bus.subscribe(
            event_types=event_types,
            handler=handler,
            filter_stream=filter_stream,
            filter_colony=filter_colony,
        )

    def unsubscribe_from_events(self, subscription_id: str) -> bool:
        return self._event_bus.unsubscribe(subscription_id)

    # ── Trigger Registration ────────────────────────────────────

    def register_trigger(self, spec: TriggerSpec) -> None:
        if self._running:
            raise RuntimeError("Cannot register triggers while runtime is running")
        if spec.id in self._triggers:
            raise ValueError(f"Trigger '{spec.id}' already registered")
        self._triggers[spec.id] = spec
        logger.info("Registered trigger: %s (%s)", spec.id, spec.trigger_type)

    def unregister_trigger(self, trigger_id: str) -> bool:
        if self._running:
            raise RuntimeError("Cannot unregister triggers while runtime is running")
        return self._triggers.pop(trigger_id, None) is not None

    # ── Internal Helpers ────────────────────────────────────────

    def _make_runtime_adapter(self, worker_id: str):
        from framework.host.stream_runtime import StreamDecisionTracker

        return StreamDecisionTracker(
            stream_id=f"worker:{worker_id}",
            storage=self._storage,
        )

    def _prune_idempotency_keys(self) -> None:
        ttl = self._config.idempotency_ttl_seconds
        if ttl > 0:
            cutoff = time.time() - ttl
            for key, recorded_at in list(self._idempotency_times.items()):
                if recorded_at < cutoff:
                    self._idempotency_times.pop(key, None)
                    self._idempotency_keys.pop(key, None)
        max_keys = self._config.idempotency_max_keys
        if max_keys > 0:
            while len(self._idempotency_keys) > max_keys:
                old_key, _ = self._idempotency_keys.popitem(last=False)
                self._idempotency_times.pop(old_key, None)

    async def _start_timers(self) -> None:
        for trig_id, spec in self._triggers.items():
            if spec.trigger_type != "timer":
                continue
            tc = spec.trigger_config
            _raw_interval = tc.get("interval_minutes")
            interval = float(_raw_interval) if _raw_interval is not None else None
            run_immediately = tc.get("run_immediately", False)

            if interval and interval > 0 and self._running:
                task = asyncio.create_task(
                    self._timer_loop(trig_id, interval, run_immediately),
                    name=f"timer:{trig_id}",
                )
                task.add_done_callback(self._on_timer_task_done)
                self._timer_tasks.append(task)

    async def _timer_loop(
        self,
        trigger_id: str,
        interval_minutes: float,
        immediate: bool,
        idle_timeout: float = 300,
    ) -> None:
        interval_secs = interval_minutes * 60
        if not immediate:
            self._timer_next_fire[trigger_id] = time.monotonic() + interval_secs
            await asyncio.sleep(interval_secs)

        while self._running:
            if self._timers_paused:
                self._timer_next_fire[trigger_id] = time.monotonic() + interval_secs
                await asyncio.sleep(interval_secs)
                continue

            idle = self.agent_idle_seconds
            if idle < idle_timeout:
                logger.debug("Timer '%s': agent active, skipping", trigger_id)
                self._timer_next_fire[trigger_id] = time.monotonic() + interval_secs
                await asyncio.sleep(interval_secs)
                continue

            self._timer_next_fire.pop(trigger_id, None)
            try:
                await self.trigger(
                    trigger_id,
                    {"event": {"source": "timer", "reason": "scheduled"}},
                )
            except Exception:
                logger.error("Timer trigger failed for '%s'", trigger_id, exc_info=True)

            self._timer_next_fire[trigger_id] = time.monotonic() + interval_secs
            await asyncio.sleep(interval_secs)

    async def cancel_all_tasks_async(self) -> bool:
        cancelled = False
        for worker in self._workers.values():
            if worker._task_handle and not worker._task_handle.done():
                worker._task_handle.cancel()
                cancelled = True
        return cancelled

    def cancel_all_tasks(self, loop: asyncio.AbstractEventLoop) -> bool:
        future = asyncio.run_coroutine_threadsafe(self.cancel_all_tasks_async(), loop)
        try:
            return future.result(timeout=5)
        except Exception:
            logger.warning("cancel_all_tasks: timed out or failed")
            return False

    async def cancel_execution(self, trigger_id: str, worker_id: str) -> bool:
        worker = self._workers.get(worker_id)
        if worker and worker.is_active:
            await worker.stop()
            return True
        return False