# hive/core/framework/server/session_manager.py
"""Session-primary lifecycle manager for the HTTP API server.
Sessions (queen) are the primary entity. Workers are optional and can be
loaded/unloaded while the queen stays alive.
Architecture:
- Session owns EventBus + LLM, shared with queen and worker
- Queen is always present once a session starts
- Worker is optional — loaded into an existing session
"""
import asyncio
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Literal
from framework.config import QUEENS_DIR
from framework.host.triggers import TriggerDefinition
logger = logging.getLogger(__name__)
def _generate_session_id() -> str:
"""Generate a unique session ID."""
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"session_{ts}_{uuid.uuid4().hex[:8]}"
def _queen_session_dir(session_id: str, queen_name: str = "default") -> Path:
"""Return the on-disk directory for a queen session."""
return QUEENS_DIR / queen_name / "sessions" / session_id
def _find_queen_session_dir(session_id: str) -> Path:
"""Search all queen directories for a session. Falls back to default."""
if QUEENS_DIR.exists():
for queen_dir in QUEENS_DIR.iterdir():
if not queen_dir.is_dir():
continue
candidate = queen_dir / "sessions" / session_id
if candidate.exists():
return candidate
return _queen_session_dir(session_id)
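# Illustrative layout produced by the helpers above (assuming QUEENS_DIR
# resolves to ~/.hive/agents/queens; the session id below is invented):
#
#   ~/.hive/agents/queens/
#       default/
#           sessions/
#               session_20260421_184822_ab12cd34/   <- _queen_session_dir(...)
#
# _find_queen_session_dir scans every queen directory for the session id and
# falls back to the default queen's path when no match exists.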
@dataclass
class Session:
"""A live session with a queen and optional worker."""
id: str
event_bus: Any # EventBus — owned by session
llm: Any # LLMProvider — owned by session
loaded_at: float
# Queen (always present once started)
queen_executor: Any = None # GraphExecutor for queen input injection
queen_task: asyncio.Task | None = None
# Loaded colony (optional)
colony_id: str | None = None
worker_path: Path | None = None
runner: Any | None = None # AgentRunner
colony_runtime: Any | None = None # legacy worker AgentRuntime (Phase 2 deprecation pending)
# Phase 2 unified runtime: a real ColonyRuntime hosting the queen as
# overseer and (in colony sessions) parallel workers spawned via
# run_parallel_workers. Always set once _start_queen has run.
colony: Any | None = None # ColonyRuntime
worker_info: Any | None = None # AgentInfo
# Queen phase state (working/reviewing)
phase_state: Any = None # QueenPhaseState
# Worker handoff subscription (colony-scoped escalation receiver)
worker_handoff_sub: str | None = None
# Pending worker escalations awaiting queen reply.
# Keyed by request_id -> {worker_id, colony_id, reason, context, opened_at}.
# Populated by queen_orchestrator._on_worker_escalation and drained by
# the reply_to_worker tool.
pending_escalations: dict[str, dict[str, Any]] = field(default_factory=dict)
# Memory reflection + recall subscriptions (global memory)
memory_reflection_subs: list = field(default_factory=list) # list[str]
# Trigger definitions loaded from agent's triggers.json (available but inactive)
available_triggers: dict[str, TriggerDefinition] = field(default_factory=dict)
# Active trigger tracking (IDs currently firing + their asyncio tasks)
active_trigger_ids: set[str] = field(default_factory=set)
active_timer_tasks: dict[str, asyncio.Task] = field(default_factory=dict)
# Queen-owned webhook server (lazy singleton, created on first webhook trigger activation)
queen_webhook_server: Any = None
# EventBus subscription IDs for active webhook triggers (trigger_id -> sub_id)
active_webhook_subs: dict[str, str] = field(default_factory=dict)
# True after first successful worker execution (gates trigger delivery)
worker_configured: bool = False
# Monotonic timestamps for next trigger fire (mirrors AgentRuntime._timer_next_fire)
trigger_next_fire: dict[str, float] = field(default_factory=dict)
# Per-trigger fire stats (session lifetime): {trigger_id: {"fire_count": int, "last_fired_at": epoch_ms}}.
# Reset on process restart — good enough as a "since this session started" counter.
trigger_fire_stats: dict[str, dict[str, Any]] = field(default_factory=dict)
# Session directory resumption:
# When set, _start_queen writes queen conversations to this existing session's
# directory instead of creating a new one. This lets cold-restores accumulate
# all messages in the original session folder so history is never fragmented.
queen_resume_from: str | None = None
# Queen session directory (set during _start_queen, used for shutdown reflection)
queen_dir: Path | None = None
# Multi-queen support: which queen profile this session uses
queen_name: str = "default"
# Colony name: set when a worker is loaded from a colony
colony_name: str | None = None
# Session mode discriminator. "dm" = queen DM session under
# ~/.hive/agents/queens/{queen_id}/sessions/. "colony" = forked colony
# session under ~/.hive/colonies/{colony_name}/sessions/, with the
# queen running as the colony's overseer and the run_parallel_workers
# tool unlocked. The mode is the canonical discriminator for storage
# path, tool exposure, and SSE filtering — see the Phase 2 plan.
mode: Literal["dm", "colony"] = "dm"
# Set to True after the user clicks the COLONY_CREATED system message
# in this DM. Locks the chat input — the user must compact+fork into a
# fresh session before continuing the conversation. Persisted in
# meta.json so the lock survives server restarts.
colony_spawned: bool = False
spawned_colony_name: str | None = None
class SessionManager:
"""Manages session lifecycles.
Concurrent access is serialised with an asyncio.Lock (coroutine-safe on a
single event loop, not thread-safe). Workers are loaded via run_in_executor
(blocking I/O) then started on the event loop.
"""
def __init__(self, model: str | None = None, credential_store=None, queen_tool_registry=None) -> None:
self._sessions: dict[str, Session] = {}
self._loading: set[str] = set()
self._model = model
self._credential_store = credential_store
self._queen_tool_registry = queen_tool_registry
self._lock = asyncio.Lock()
# Strong references for fire-and-forget background tasks (e.g. shutdown
# reflections) so they aren't garbage-collected before completion.
self._background_tasks: set[asyncio.Task] = set()
# Run one-time v2 directory structure migration
from framework.storage.migrate_v2 import run_migration
try:
run_migration()
except Exception:
logger.warning("v2 migration failed (non-fatal)", exc_info=True)
# Ensure every existing colony has an up-to-date progress.db
# (schema v1, WAL mode) and reclaim any stale claims left behind
# by crashed workers from the previous run. Idempotent and
# fast; runs synchronously because the event loop hasn't
# started yet at __init__ time.
from framework.host.progress_db import ensure_all_colony_dbs
try:
ensured = ensure_all_colony_dbs()
if ensured:
logger.info("progress_db: ensured %d colony DB(s) at startup", len(ensured))
except Exception:
logger.warning("progress_db: backfill at startup failed (non-fatal)", exc_info=True)
def build_llm(self, model: str | None = None):
"""Construct an LLM provider using the server's configured defaults."""
from framework.config import RuntimeConfig, get_hive_config
rc = RuntimeConfig(model=model or self._model or RuntimeConfig().model)
llm_config = get_hive_config().get("llm", {})
if llm_config.get("use_antigravity_subscription"):
from framework.llm.antigravity import AntigravityProvider
return AntigravityProvider(model=rc.model)
from framework.llm.litellm import LiteLLMProvider
return LiteLLMProvider(
model=rc.model,
api_key=rc.api_key,
api_base=rc.api_base,
**rc.extra_kwargs,
)
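# Illustrative decision path for build_llm (key names taken from the code
# above; where the config file lives on disk is not shown here):
#
#   get_hive_config()["llm"]["use_antigravity_subscription"] is truthy
#       -> AntigravityProvider(model=...)
#   otherwise
#       -> LiteLLMProvider(model=..., api_key=..., api_base=..., **extra_kwargs)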
# ------------------------------------------------------------------
# Session lifecycle
# ------------------------------------------------------------------
async def _create_session_core(
self,
session_id: str | None = None,
model: str | None = None,
) -> Session:
"""Create session infrastructure (EventBus, LLM) without starting queen.
Internal helper — use create_session() or create_session_with_worker_colony().
"""
from framework.host.event_bus import EventBus
resolved_id = session_id or _generate_session_id()
async with self._lock:
if resolved_id in self._sessions:
raise ValueError(f"Session '{resolved_id}' already exists")
# Session owns these — shared with queen and worker
llm = self.build_llm(model=model)
event_bus = EventBus()
session = Session(
id=resolved_id,
event_bus=event_bus,
llm=llm,
loaded_at=time.time(),
)
async with self._lock:
self._sessions[resolved_id] = session
return session
def _resume_queen_name(self, session_id: str) -> str | None:
"""Best-effort queen identity lookup for a persisted session."""
session_dir = _find_queen_session_dir(session_id)
if not session_dir.exists():
return None
meta_path = session_dir / "meta.json"
if meta_path.exists():
try:
meta = json.loads(meta_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
meta = {}
queen_id = meta.get("queen_id")
if isinstance(queen_id, str) and queen_id.strip():
return queen_id.strip()
if session_dir.parent.name == "sessions":
queen_id = session_dir.parent.parent.name
if queen_id:
return queen_id
return None
async def _ensure_session_queen_identity(
self,
session: Session,
initial_prompt: str | None = None,
) -> dict:
"""Resolve the queen identity and return the loaded profile.
Sets ``session.queen_name`` and returns the validated profile dict.
The caller can pass the profile directly to the orchestrator without
re-loading from disk.
"""
from framework.agents.queen.queen_profiles import (
ensure_default_queens,
load_queen_profile,
select_queen,
)
ensure_default_queens()
candidates: list[str] = []
current_queen = (session.queen_name or "").strip()
if current_queen and current_queen != "default":
candidates.append(current_queen)
if session.queen_resume_from:
resumed_queen = self._resume_queen_name(session.queen_resume_from)
if resumed_queen and resumed_queen not in candidates:
candidates.append(resumed_queen)
for queen_id in candidates:
try:
profile = load_queen_profile(queen_id)
except FileNotFoundError:
logger.warning("Session '%s': queen profile '%s' not found", session.id, queen_id)
continue
session.queen_name = queen_id
return profile
selector_input = initial_prompt or ""
queen_id = await select_queen(selector_input, session.llm)
session.queen_name = queen_id
return load_queen_profile(queen_id)
async def create_session(
self,
session_id: str | None = None,
model: str | None = None,
initial_prompt: str | None = None,
queen_resume_from: str | None = None,
queen_name: str | None = None,
initial_phase: str | None = None,
) -> Session:
"""Create a new session with a queen but no worker.
When ``queen_resume_from`` is set the queen writes conversation messages
to that existing session's directory instead of creating a new one.
This preserves full conversation history across server restarts.
When ``queen_name`` is set the session is pre-bound to that queen
identity, skipping LLM auto-selection in the identity hook.
"""
# Reuse the original session ID when cold-restoring
resolved_session_id = queen_resume_from or session_id
session = await self._create_session_core(session_id=resolved_session_id, model=model)
session.queen_resume_from = queen_resume_from
if queen_name:
session.queen_name = queen_name
# Start queen immediately (queen-only, no worker tools yet)
await self._start_queen(
session,
worker_identity=None,
initial_prompt=initial_prompt,
initial_phase=initial_phase,
)
logger.info(
"Session '%s' created (queen-only, resume_from=%s)",
session.id,
queen_resume_from,
)
return session
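# Usage sketch (illustrative; the caller and the model id are hypothetical):
#
#   manager = SessionManager(model="some-model-id")
#   session = await manager.create_session(initial_prompt="Plan my week")
#   # Cold-restore the same conversation after a server restart; messages
#   # keep accumulating in the original session directory:
#   restored = await manager.create_session(queen_resume_from=session.id)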
async def create_session_with_worker_colony(
self,
agent_path: str | Path,
agent_id: str | None = None,
session_id: str | None = None,
model: str | None = None,
initial_prompt: str | None = None,
queen_resume_from: str | None = None,
queen_name: str | None = None,
initial_phase: str | None = None,
worker_name: str | None = None,
) -> Session:
"""Create a session and load a worker in one step.
When ``worker_name`` is provided, creates a worker-only session
(no queen) — the worker runs as the primary interactive loop.
Otherwise, creates a queen+worker session (legacy path).
"""
agent_path = Path(agent_path)
resolved_colony_id = agent_id or agent_path.name
if worker_name:
return await self._create_worker_only_session(
agent_path=agent_path,
worker_name=worker_name,
colony_id=resolved_colony_id,
session_id=session_id,
model=model,
initial_prompt=initial_prompt,
queen_resume_from=queen_resume_from,
queen_name=queen_name,
)
from framework.tools.queen_lifecycle_tools import build_worker_profile
# Read colony metadata.json for queen provenance (queen_name,
# queen_session_id) so we can restore the correct queen identity
# and resume from the originating session when no explicit
# queen_resume_from was provided.
_colony_metadata: dict = {}
_colony_metadata_path = agent_path / "metadata.json"
if _colony_metadata_path.exists():
try:
_colony_metadata = json.loads(_colony_metadata_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
pass
if not queen_name:
queen_name = _colony_metadata.get("queen_name") or None
# Colony metadata's queen_session_id is the authoritative session
# for this colony (the forked session). It takes priority over
# whatever the frontend found via history scan, which may be the
# source session instead of the fork.
_colony_session_id = _colony_metadata.get("queen_session_id")
if _colony_session_id:
queen_resume_from = _colony_session_id
elif not queen_resume_from:
queen_resume_from = None
# When cold-restoring, check meta.json for the phase — if the agent
# was still being built we must NOT try to load the worker (the code
# is incomplete and will fail to import).
_resume_queen_id: str | None = None
if queen_resume_from:
_resume_phase = None
_meta_path = _find_queen_session_dir(queen_resume_from) / "meta.json"
if _meta_path.exists():
try:
_meta = json.loads(_meta_path.read_text(encoding="utf-8"))
_resume_phase = _meta.get("phase")
_resume_queen_id = _meta.get("queen_id")
except (json.JSONDecodeError, OSError):
pass
if _resume_phase in ("building", "planning"):
# Fall back to queen-only session — cold resume handler in
# _start_queen will set phase_state.agent_path and switch to
# the correct phase.
return await self.create_session(
session_id=session_id,
model=model,
initial_prompt=initial_prompt,
queen_resume_from=queen_resume_from,
queen_name=queen_name or _resume_queen_id,
)
# Use the colony's forked session ID as the live session ID.
# If it's already live (user navigated back), return it directly
# -- but only if it belongs to this colony.
if queen_resume_from and queen_resume_from in self._sessions:
existing = self._sessions[queen_resume_from]
if existing.worker_path and str(existing.worker_path) == str(agent_path):
return existing
# When the queen forked this colony, the inherited DM transcript
# is compacted in the background (see fork_session_into_colony).
# Block here until that compactor finishes so _load_worker_core
# reads the compacted summary — not the raw transcript (which
# would defeat the fork's purpose). Bounded wait: on timeout we
# proceed anyway so a stuck compactor can't brick the colony.
if queen_resume_from:
try:
from framework.server import compaction_status
await compaction_status.await_completion(
_find_queen_session_dir(queen_resume_from),
timeout=180.0,
)
except Exception:
logger.debug(
"await_compaction failed for %s — proceeding",
queen_resume_from,
exc_info=True,
)
session = await self._create_session_core(
session_id=_colony_session_id or queen_resume_from,
model=model,
)
session.queen_resume_from = queen_resume_from
if queen_name:
session.queen_name = queen_name
elif _resume_queen_id:
session.queen_name = _resume_queen_id
try:
# Load the colony FIRST (before queen) so queen gets full tools
await self._load_worker_core(
session,
agent_path,
colony_id=resolved_colony_id,
model=model,
)
# Restore active triggers from persisted state (cold restore)
await self._restore_active_triggers(session, session.id)
# Start queen with worker profile + lifecycle + monitoring tools
worker_identity = (
build_worker_profile(session.colony_runtime, agent_path=agent_path) if session.colony_runtime else None
)
await self._start_queen(
session,
worker_identity=worker_identity,
initial_prompt=initial_prompt,
initial_phase=initial_phase,
)
except Exception:
if queen_resume_from:
# Cold restore: worker load failed (e.g. incomplete code from a
# building session, or the colony directory was deleted). Fall
# back to queen-only so the user can continue the conversation.
# Forward queen_name so the recovered session is stored under
# the correct queen identity -- otherwise it lands in default/
# and the frontend routes the user to the wrong dir.
logger.warning(
"Cold restore: worker load failed for '%s', falling back to queen-only",
agent_path,
exc_info=True,
)
await self.stop_session(session.id)
return await self.create_session(
session_id=session_id,
model=model,
initial_prompt=initial_prompt,
queen_resume_from=queen_resume_from,
queen_name=queen_name or _resume_queen_id,
)
# If anything fails (non-cold-restore), tear down the session
await self.stop_session(session.id)
raise
return session
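# Usage sketch (illustrative; the colony path and worker name are invented,
# only the ~/.hive/colonies/{colony_name} layout comes from the comments above):
#
#   session = await manager.create_session_with_worker_colony(
#       agent_path="~/.hive/colonies/research_crew",
#       initial_prompt="Resume where we left off",
#   )
#   # With worker_name set, the worker (not the queen) becomes the primary
#   # interactive executor:
#   session = await manager.create_session_with_worker_colony(
#       agent_path="~/.hive/colonies/research_crew",
#       worker_name="scout",
#   )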
async def _create_worker_only_session(
self,
agent_path: Path,
worker_name: str,
colony_id: str,
session_id: str | None = None,
model: str | None = None,
initial_prompt: str | None = None,
queen_resume_from: str | None = None,
queen_name: str | None = None,
) -> Session:
"""Create a worker-only session (no queen).
Loads the worker's {worker_name}.json config, creates an AgentLoop,
and sets it as the primary interactive executor so chat/SSE work
through the existing infrastructure.
"""
import json as _json
import shutil
from framework.agent_loop.agent_loop import AgentLoop, LoopConfig
from framework.agent_loop.types import AgentContext, AgentSpec
from framework.orchestrator.graph_executor import GraphExecutor
from framework.schemas.goal import Goal
from framework.storage.conversation_store import FileConversationStore
from framework.tracker.decision_tracker import DecisionTracker
worker_config_path = agent_path / f"{worker_name}.json"
if not worker_config_path.exists():
raise FileNotFoundError(f"Worker config not found: {worker_config_path}")
worker_data = _json.loads(worker_config_path.read_text(encoding="utf-8"))
session = await self._create_session_core(
session_id=queen_resume_from,
model=model,
)
session.queen_resume_from = queen_resume_from
if queen_name:
session.queen_name = queen_name
session.colony_id = colony_id
session.colony_name = colony_id
session.worker_path = agent_path
# Worker storage: ~/.hive/agents/{colony_name}/{worker_name}/
worker_storage = Path.home() / ".hive" / "agents" / colony_id / worker_name
worker_storage.mkdir(parents=True, exist_ok=True)
# Copy conversations from colony if fresh
worker_conv_dir = worker_storage / "conversations"
if not worker_conv_dir.exists():
colony_conv = agent_path / "conversations"
if colony_conv.exists():
shutil.copytree(colony_conv, worker_conv_dir)
conversation_store = FileConversationStore(worker_conv_dir)
# Build AgentSpec from worker config
spec = AgentSpec(
id=worker_name,
name=worker_data.get("name", worker_name),
description=worker_data.get("description", ""),
system_prompt=worker_data.get("system_prompt", ""),
tools=worker_data.get("tools", []),
tool_access_policy="all",
identity_prompt=worker_data.get("identity_prompt", ""),
)
# Build loop config
lc_data = worker_data.get("loop_config", {})
loop_config = LoopConfig(
max_iterations=lc_data.get("max_iterations", 999_999),
max_tool_calls_per_turn=lc_data.get("max_tool_calls_per_turn", 30),
max_context_tokens=lc_data.get("max_context_tokens", 180_000),
spillover_dir=str(agent_path / "data"),
)
# Build goal
goal_data = worker_data.get("goal", {})
goal = Goal(
id=f"{colony_id}-{worker_name}",
name=goal_data.get("description", worker_name)[:60],
description=goal_data.get("description", ""),
)
# Queen dir for SSE/session metadata (reuse queen session storage)
storage_session_id = queen_resume_from or session.id
queen_dir = _queen_session_dir(storage_session_id, session.queen_name)
queen_dir.mkdir(parents=True, exist_ok=True)
session.queen_dir = queen_dir
# Write meta
_meta_path = queen_dir / "meta.json"
_existing_meta: dict = {}
if _meta_path.exists():
try:
_existing_meta = _json.loads(_meta_path.read_text(encoding="utf-8"))
except Exception:
pass
_existing_meta.update(
{
"created_at": time.time(),
"queen_id": session.queen_name,
"agent_name": worker_name,
"agent_path": str(agent_path),
"worker_name": worker_name,
}
)
_meta_path.write_text(_json.dumps(_existing_meta), encoding="utf-8")
# Set up event log
iteration_offset = 0
events_path = queen_dir / "events.jsonl"
if events_path.exists():
max_iter = -1
with open(events_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
evt = _json.loads(line)
i = evt.get("iteration", 0)
if i > max_iter:
max_iter = i
except Exception:
pass
if max_iter >= 0:
iteration_offset = max_iter + 1
# Load the worker via AgentLoader to get the full pipeline (MCP, skills, creds)
from framework.loader import AgentLoader
loop = asyncio.get_running_loop()
runner = await loop.run_in_executor(
None,
lambda: AgentLoader.load(
agent_path,
model=model or self._model,
interactive=False,
skip_credential_validation=True,
credential_store=self._credential_store,
),
)
if runner._agent_runtime is None:
await loop.run_in_executor(
None,
lambda: runner._setup(event_bus=session.event_bus),
)
session.colony_runtime = runner._agent_runtime
session.runner = runner
# Start the AgentHost
runtime = runner._agent_runtime
if runtime and not runtime.is_running:
await runtime.start()
# Register entry point so we can trigger execution
from framework.host.execution_manager import EntryPointSpec
if not runtime._streams:
runtime.register_entry_point(
EntryPointSpec(
id="default",
name="Default",
entry_node=worker_name,
trigger_type="manual",
isolation_level="shared",
),
)
# Create a queen-like executor for the worker so chat injection works
# We reuse the queen_executor field even though it's a worker
queen_registry = runner._tool_registry
# Start with queen's default tools if available
queen_llm = runner._llm or session.llm
all_tools = list(queen_registry.get_tools().values())
tool_executor = queen_registry.get_executor()
agent_loop = AgentLoop(
event_bus=session.event_bus,
config=loop_config,
tool_executor=tool_executor,
conversation_store=conversation_store,
)
worker_ctx = AgentContext(
runtime=DecisionTracker(worker_storage),
agent_id=worker_name,
agent_spec=spec,
input_data={"task": goal_data.get("description", "")},
llm=queen_llm,
available_tools=all_tools,
goal_context=goal.to_prompt_context(),
goal=goal,
max_tokens=8192,
stream_id=worker_name,
execution_id=worker_name,
identity_prompt=worker_data.get("identity_prompt", ""),
memory_prompt=worker_data.get("memory_prompt", ""),
skills_catalog_prompt=worker_data.get("skills_catalog_prompt", ""),
protocols_prompt=worker_data.get("protocols_prompt", ""),
skill_dirs=worker_data.get("skill_dirs", []),
)
session.queen_executor = GraphExecutor(
node_id=worker_name,
agent_loop=agent_loop,
context=worker_ctx,
event_bus=session.event_bus,
)
# Start the worker's agent loop in the background.
# Scope browser profile per-session so parallel sessions drive
# independent Chrome tab groups. Browser tools live in an MCP
# subprocess; we inject `profile` via the ToolRegistry execution
# context (a CONTEXT_PARAM) so it flows into every tool call.
async def _run_worker():
try:
from framework.loader.tool_registry import ToolRegistry
ToolRegistry.set_execution_context(profile=session.id)
except Exception:
logger.debug("Worker: failed to set browser profile", exc_info=True)
await session.queen_executor.run(initial_message=initial_prompt)
session.queen_task = asyncio.create_task(_run_worker())
# Set up event persistence
if session.event_bus and queen_dir:
session.event_bus.start_persistence(queen_dir, iteration_offset=iteration_offset)
logger.info(
"Worker-only session '%s' started: colony=%s worker=%s tools=%d",
session.id,
colony_id,
worker_name,
len(all_tools),
)
async with self._lock:
self._loading.discard(session.id)
return session
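# Illustrative shape of the {worker_name}.json this method reads (key names
# come from the reads above; the values are invented):
#
#   {
#     "name": "Scout",
#     "description": "Research worker",
#     "system_prompt": "...",
#     "identity_prompt": "...",
#     "tools": ["web_search"],
#     "loop_config": {"max_iterations": 999999,
#                     "max_tool_calls_per_turn": 30,
#                     "max_context_tokens": 180000},
#     "goal": {"description": "Collect sources on the assigned topic"}
#   }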
# ------------------------------------------------------------------
# Worker lifecycle
# ------------------------------------------------------------------
async def _load_worker_core(
self,
session: Session,
agent_path: str | Path,
colony_id: str | None = None,
model: str | None = None,
) -> None:
"""Load a worker into a session (core logic).
Sets up the runner, runtime, and session fields. Does NOT notify
the queen — callers handle that step.
"""
from framework.loader import AgentLoader
agent_path = Path(agent_path)
resolved_colony_id = colony_id or agent_path.name
if session.colony_runtime is not None:
raise ValueError(f"Session '{session.id}' already has colony '{session.colony_id}'")
async with self._lock:
if session.id in self._loading:
raise ValueError(f"Session '{session.id}' is currently loading a colony")
self._loading.add(session.id)
try:
# Blocking I/O — load in executor
loop = asyncio.get_running_loop()
# By default, workers share the session's LLM with the queen so
# execution and memory reflection/recall stay on the same model.
session_model = getattr(session.llm, "model", None)
resolved_model = model or session_model or self._model
runner = await loop.run_in_executor(
None,
lambda: AgentLoader.load(
agent_path,
model=resolved_model,
interactive=False,
skip_credential_validation=True,
credential_store=self._credential_store,
),
)
if model is None:
runner._llm = session.llm
# Setup with session's event bus
if runner._agent_runtime is None:
await loop.run_in_executor(
None,
lambda: runner._setup(event_bus=session.event_bus),
)
runtime = runner._agent_runtime
# Load triggers from the agent's triggers.json definition file.
# triggers.json is written exclusively by set_trigger, so the
# presence of an entry means the user explicitly activated this
# trigger in a previous session. We treat the file as the
# source of truth and auto-start each trigger on colony load
# so the user doesn't have to re-activate after every restart.
# The per-session active_triggers tracking still functions, but
# is no longer the only path to "running" status.
from framework.tools.queen_lifecycle_tools import (
_read_agent_triggers_json,
_start_trigger_timer,
_start_trigger_webhook,
)
triggers_to_autostart: list[str] = []
for tdata in _read_agent_triggers_json(agent_path):
tid = tdata.get("id", "")
ttype = tdata.get("trigger_type", "")
if tid and ttype in ("timer", "webhook"):
session.available_triggers[tid] = TriggerDefinition(
id=tid,
trigger_type=ttype,
trigger_config=tdata.get("trigger_config", {}),
description=tdata.get("name", tid),
task=tdata.get("task", ""),
)
triggers_to_autostart.append(tid)
logger.info("Loaded trigger '%s' (%s) from triggers.json", tid, ttype)
# Auto-start every trigger discovered in triggers.json. The
# frontend listens for TRIGGER_ACTIVATED to render the active
# state; per-session active_triggers tracking still happens
# via _persist_active_triggers below.
for tid in triggers_to_autostart:
tdef = session.available_triggers[tid]
try:
if tdef.trigger_type == "timer":
await _start_trigger_timer(session, tid, tdef)
elif tdef.trigger_type == "webhook":
await _start_trigger_webhook(session, tid, tdef)
tdef.active = True
session.active_trigger_ids.add(tid)
logger.info("Auto-started trigger '%s' on colony load", tid)
except Exception:
logger.warning(
"Failed to auto-start trigger '%s' on colony load",
tid,
exc_info=True,
)
if session.active_trigger_ids:
# Persist the auto-started set so a subsequent restart
# finds them in state.active_triggers and the existing
# _restore_active_triggers path also keeps working.
from framework.tools.queen_lifecycle_tools import (
_persist_active_triggers,
)
await _persist_active_triggers(session, session.id)
if session.available_triggers:
# Emit AVAILABLE for every trigger (so the UI knows the
# definition exists) and ACTIVATED for the ones we just
# auto-started. The frontend handler treats them as the
# same case and uses the latter to flip the card to
# active.
await self._emit_trigger_events(session, "available", session.available_triggers)
if session.active_trigger_ids:
activated = {
tid: session.available_triggers[tid]
for tid in session.active_trigger_ids
if tid in session.available_triggers
}
if activated:
await self._emit_trigger_events(session, "activated", activated)
# Start runtime on event loop
if runtime and not runtime.is_running:
await runtime.start()
# Clean up stale "active" sessions from previous (dead) processes
self._cleanup_stale_active_sessions(agent_path)
info = runner.info()
# Update session
session.colony_id = resolved_colony_id
session.worker_path = agent_path
session.runner = runner
session.colony_runtime = runtime
session.worker_info = info
async with self._lock:
self._loading.discard(session.id)
logger.info(
"Worker '%s' loaded into session '%s'",
resolved_colony_id,
session.id,
)
except Exception:
async with self._lock:
self._loading.discard(session.id)
raise
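# Illustrative shape of a triggers.json entry consumed by _load_worker_core
# (field names come from the reads above and from _emit_trigger_events; the
# values are invented):
#
#   {"id": "daily_digest", "trigger_type": "timer",
#    "trigger_config": {"interval_minutes": 1440},
#    "name": "Daily digest", "task": "Summarise yesterday's activity"}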
def _cleanup_stale_active_sessions(self, agent_path: Path) -> None:
"""Mark stale 'active' sessions on disk as 'cancelled'.
When a new runtime starts, any on-disk session still marked 'active'
is from a process that no longer exists. 'Paused' sessions are left
intact so they remain resumable.
Two-layer protection against corrupting live sessions:
1. In-memory: skip any session ID currently tracked in self._sessions
(guaranteed alive in this process).
2. PID validation: if state.json contains a ``pid`` field, check whether
that process is still running on the host. If it is, the session is
owned by another healthy worker process, so leave it alone.
"""
sessions_path = Path.home() / ".hive" / "agents" / agent_path.name / "sessions"
if not sessions_path.exists():
return
live_session_ids = set(self._sessions.keys())
for d in sessions_path.iterdir():
if not d.is_dir() or not d.name.startswith("session_"):
continue
state_path = d / "state.json"
if not state_path.exists():
continue
try:
state = json.loads(state_path.read_text(encoding="utf-8"))
if state.get("status") != "active":
continue
# Layer 1: skip sessions that are alive in this process
session_id = state.get("session_id", d.name)
if session_id in live_session_ids or d.name in live_session_ids:
logger.debug(
"Skipping live in-memory session '%s' during stale cleanup",
d.name,
)
continue
# Layer 2: skip sessions whose owning process is still alive
recorded_pid = state.get("pid")
if recorded_pid is not None and self._is_pid_alive(recorded_pid):
logger.debug(
"Skipping session '%s' — owning process %d is still running",
d.name,
recorded_pid,
)
continue
state["status"] = "cancelled"
state.setdefault("result", {})["error"] = "Stale session: runtime restarted"
state.setdefault("timestamps", {})["updated_at"] = datetime.now().isoformat()
state_path.write_text(json.dumps(state, indent=2), encoding="utf-8")
logger.info("Marked stale session '%s' as cancelled for agent '%s'", d.name, agent_path.name)
except (json.JSONDecodeError, OSError) as e:
logger.warning("Failed to clean up stale session %s: %s", d.name, e)
@staticmethod
def _is_pid_alive(pid: int) -> bool:
"""Check whether a process with the given PID is still running."""
import os
import platform
if platform.system() == "Windows":
import ctypes
# PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
kernel32 = ctypes.windll.kernel32
handle = kernel32.OpenProcess(0x1000, False, pid)
if not handle:
# 5 is ERROR_ACCESS_DENIED, meaning the process exists but is protected
return kernel32.GetLastError() == 5
exit_code = ctypes.c_ulong()
kernel32.GetExitCodeProcess(handle, ctypes.byref(exit_code))
kernel32.CloseHandle(handle)
# 259 is STILL_ACTIVE
return exit_code.value == 259
else:
try:
os.kill(pid, 0)
except OSError:
return False
return True
async def _restore_active_triggers(self, session: "Session", session_id: str) -> None:
"""Restore previously active triggers from persisted session state.
Called after worker loading to restart any timer/webhook triggers
that were active before a server restart.
"""
if not session.available_triggers or not session.colony_runtime:
return
try:
store = session.colony_runtime._session_store
state = await store.read_state(session_id)
if state and state.active_triggers:
from framework.host.event_bus import AgentEvent, EventType
from framework.tools.queen_lifecycle_tools import (
_start_trigger_timer,
_start_trigger_webhook,
)
runner = getattr(session, "runner", None)
colony_entry = runner.graph.entry_node if runner else None
saved_tasks = getattr(state, "trigger_tasks", {}) or {}
for tid in state.active_triggers:
tdef = session.available_triggers.get(tid)
if tdef:
# Restore user-configured task override
saved_task = saved_tasks.get(tid, "")
if saved_task:
tdef.task = saved_task
tdef.active = True
session.active_trigger_ids.add(tid)
if tdef.trigger_type == "timer":
await _start_trigger_timer(session, tid, tdef)
logger.info("Restored trigger timer '%s'", tid)
elif tdef.trigger_type == "webhook":
await _start_trigger_webhook(session, tid, tdef)
logger.info("Restored webhook trigger '%s'", tid)
# Emit TRIGGER_ACTIVATED so the frontend knows this
# trigger is running after a server restart. Without
# this, the previously-available event is the only
# signal the UI ever gets, and the trigger appears
# inactive forever.
if session.event_bus:
await session.event_bus.publish(
AgentEvent(
type=EventType.TRIGGER_ACTIVATED,
stream_id="queen",
data={
"trigger_id": tdef.id,
"trigger_type": tdef.trigger_type,
"trigger_config": tdef.trigger_config,
"name": tdef.description or tdef.id,
**({"entry_node": colony_entry} if colony_entry else {}),
},
)
)
else:
logger.warning(
"Saved trigger '%s' not found in worker entry points, skipping",
tid,
)
# Restore worker_configured flag
if state and getattr(state, "worker_configured", False):
session.worker_configured = True
except Exception as e:
logger.warning("Failed to restore active triggers: %s", e)
async def load_colony(
self,
session_id: str,
agent_path: str | Path,
colony_id: str | None = None,
model: str | None = None,
) -> Session:
"""Load a worker colony into an existing session (with running queen).
Starts the colony runtime and notifies the queen.
"""
agent_path = Path(agent_path)
session = self._sessions.get(session_id)
if session is None:
raise ValueError(f"Session '{session_id}' not found")
await self._load_worker_core(
session,
agent_path,
colony_id=colony_id,
model=model,
)
# Notify queen about the loaded worker (skip for queen itself).
if agent_path.name != "queen" and session.colony_runtime:
await self._notify_queen_colony_loaded(session)
# Update meta.json so cold-restore can discover this session by agent_path
storage_session_id = session.queen_resume_from or session.id
meta_path = _queen_session_dir(storage_session_id, session.queen_name) / "meta.json"
try:
_agent_name = (
session.worker_info.name if session.worker_info else str(agent_path.name).replace("_", " ").title()
)
existing_meta = {}
if meta_path.exists():
existing_meta = json.loads(meta_path.read_text(encoding="utf-8"))
existing_meta["agent_name"] = _agent_name
existing_meta["agent_path"] = str(session.worker_path) if session.worker_path else str(agent_path)
meta_path.write_text(json.dumps(existing_meta), encoding="utf-8")
except OSError:
pass
await self._restore_active_triggers(session, session_id)
# Emit SSE event so the frontend can update UI
await self._emit_colony_loaded(session)
return session
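# Usage sketch (illustrative; the path is hypothetical):
#
#   await manager.load_colony(session.id, "~/.hive/colonies/research_crew")
#   # The queen receives a "[SYSTEM] Colony loaded." injection and the
#   # frontend gets a WORKER_COLONY_LOADED event over SSE.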
async def unload_colony(self, session_id: str) -> bool:
"""Unload the worker colony from a session. Queen stays alive."""
session = self._sessions.get(session_id)
if session is None:
return False
if session.colony_runtime is None:
return False
# Cleanup worker
if session.runner:
try:
await session.runner.cleanup_async()
except Exception as e:
logger.error("Error cleaning up colony '%s': %s", session.colony_id, e)
# Cancel active trigger timers
for tid, task in session.active_timer_tasks.items():
task.cancel()
logger.info("Cancelled trigger timer '%s' on unload", tid)
session.active_timer_tasks.clear()
# Unsubscribe webhook handlers (server stays alive — queen-owned)
for sub_id in session.active_webhook_subs.values():
try:
session.event_bus.unsubscribe(sub_id)
except Exception:
pass
session.active_webhook_subs.clear()
session.active_trigger_ids.clear()
# Clean up triggers
if session.available_triggers:
await self._emit_trigger_events(session, "removed", session.available_triggers)
session.available_triggers.clear()
colony_id = session.colony_id
session.colony_id = None
session.worker_path = None
session.runner = None
session.colony_runtime = None
session.worker_info = None
# Notify queen
await self._notify_queen_worker_unloaded(session)
logger.info("Colony '%s' unloaded from session '%s'", colony_id, session_id)
return True
# ------------------------------------------------------------------
# Session teardown
# ------------------------------------------------------------------
async def stop_session(self, session_id: str) -> bool:
"""Stop a session entirely — unload worker + cancel queen."""
async with self._lock:
session = self._sessions.pop(session_id, None)
if session is None:
return False
if session.worker_handoff_sub is not None:
try:
session.event_bus.unsubscribe(session.worker_handoff_sub)
except Exception:
pass
session.worker_handoff_sub = None
# Stop memory reflection/recall subscriptions
for sub_id in session.memory_reflection_subs:
try:
session.event_bus.unsubscribe(sub_id)
except Exception:
pass
session.memory_reflection_subs.clear()
# Run a final shutdown reflection so recent conversation insights
# are persisted before the session is destroyed (fire-and-forget).
if session.queen_dir is not None:
try:
from framework.agents.queen.queen_memory_v2 import (
global_memory_dir,
queen_memory_dir,
)
from framework.agents.queen.reflection_agent import run_shutdown_reflection
global_mem_dir = global_memory_dir()
queen_mem_dir = queen_memory_dir(session.queen_name)
if session.phase_state is not None:
global_mem_dir = session.phase_state.global_memory_dir or global_mem_dir
queen_mem_dir = session.phase_state.queen_memory_dir or queen_mem_dir
task = asyncio.create_task(
asyncio.shield(
run_shutdown_reflection(
session.queen_dir,
session.llm,
global_memory_dir_override=global_mem_dir,
queen_memory_dir=queen_mem_dir,
queen_id=session.queen_name,
)
),
name=f"shutdown-reflect-{session_id}",
)
logger.info("Session '%s': shutdown reflection spawned", session_id)
self._background_tasks.add(task)
task.add_done_callback(self._background_tasks.discard)
except Exception:
logger.warning("Session '%s': failed to spawn shutdown reflection", session_id, exc_info=True)
if session.queen_task is not None:
session.queen_task.cancel()
session.queen_task = None
session.queen_executor = None
# Cancel active trigger timers
for task in session.active_timer_tasks.values():
task.cancel()
session.active_timer_tasks.clear()
# Unsubscribe webhook handlers and stop queen webhook server
for sub_id in session.active_webhook_subs.values():
try:
session.event_bus.unsubscribe(sub_id)
except Exception:
pass
session.active_webhook_subs.clear()
if session.queen_webhook_server is not None:
try:
await session.queen_webhook_server.stop()
except Exception:
logger.error("Error stopping queen webhook server", exc_info=True)
session.queen_webhook_server = None
# Cleanup worker
if session.runner:
try:
await session.runner.cleanup_async()
except Exception as e:
logger.error("Error cleaning up worker: %s", e)
# Stop the unified ColonyRuntime (Phase 2 wiring) if it was started
if session.colony is not None:
try:
await session.colony.stop()
except Exception:
logger.warning(
"Session '%s': error stopping unified ColonyRuntime",
session_id,
exc_info=True,
)
session.colony = None
# Close per-session event log
session.event_bus.close_session_log()
logger.info("Session '%s' stopped", session_id)
return True
# ------------------------------------------------------------------
# Queen startup
# ------------------------------------------------------------------
def _subscribe_worker_handoffs(self, session: Session, executor: Any) -> None:
"""Deprecated — colony-scoped escalation routing lives in queen_orchestrator.
Kept as a shim so any legacy caller is a no-op. The real subscription
is installed by ``queen_orchestrator.create_queen`` via
``colony_runtime.subscribe_to_events(..., filter_colony=...)`` so that
cross-colony leakage is impossible and every handoff carries the
worker_id + request_id the queen needs to reply with addressed intent.
"""
return None
async def _start_queen(
self,
session: Session,
worker_identity: str | None,
initial_prompt: str | None = None,
initial_phase: str | None = None,
) -> None:
"""Start the queen executor for a session.
When ``session.queen_resume_from`` is set, queen conversation messages
are written to the ORIGINAL session's directory so the full conversation
history accumulates in one place across server restarts.
"""
from framework.server.queen_orchestrator import create_queen
logger.debug(
"[_start_queen] Starting for session %s, current queen_executor=%s",
session.id,
session.queen_executor,
)
queen_profile = await self._ensure_session_queen_identity(session, initial_prompt)
# Determine which session directory to use for queen storage.
# When queen_resume_from is set we write to the ORIGINAL session's
# directory so that all messages accumulate in one place.
storage_session_id = session.queen_resume_from or session.id
queen_dir = _queen_session_dir(storage_session_id, session.queen_name)
queen_dir.mkdir(parents=True, exist_ok=True)
session.queen_dir = queen_dir
# Always write/update session metadata so history sidebar has correct
# agent name, path, and last-active timestamp (important so the original
# session directory sorts as "most recent" after a cold-restore resume).
_meta_path = queen_dir / "meta.json"
try:
_agent_name = (
session.worker_info.name
if session.worker_info
else (str(session.worker_path.name).replace("_", " ").title() if session.worker_path else None)
)
# Merge into existing meta.json to preserve fields written by
# _update_meta_json (e.g. phase, agent_path set during building).
_existing_meta: dict = {}
if _meta_path.exists():
try:
_existing_meta = json.loads(_meta_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
pass
_new_meta: dict = {
"created_at": time.time(),
"queen_id": session.queen_name,
}
if _agent_name is not None:
_new_meta["agent_name"] = _agent_name
if session.worker_path is not None:
_new_meta["agent_path"] = str(session.worker_path)
_existing_meta.update(_new_meta)
_meta_path.write_text(json.dumps(_existing_meta), encoding="utf-8")
# Hydrate colony-spawned lock state from meta.json so the lock
# survives server restart / cold-resume into a live session.
if _existing_meta.get("colony_spawned") is True:
session.colony_spawned = True
_spawned_name = _existing_meta.get("spawned_colony_name")
if isinstance(_spawned_name, str):
session.spawned_colony_name = _spawned_name
except OSError:
pass
# Enable per-session event persistence so that all eventbus events
# survive server restarts and can be replayed on cold-session resume.
# Scan the existing event log to find the max iteration ever written,
# then use max+1 as offset so resumed sessions produce monotonically
# increasing iteration values — preventing frontend message ID collisions.
iteration_offset = 0
last_phase = ""
events_path = queen_dir / "events.jsonl"
try:
if events_path.exists():
max_iter = -1
with open(events_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
evt = json.loads(line)
data = evt.get("data", {})
it = data.get("iteration")
if isinstance(it, int) and it > max_iter:
max_iter = it
# Track the latest queen phase from QUEEN_PHASE_CHANGED events
if evt.get("type") == "queen_phase_changed":
phase = data.get("phase")
if phase:
last_phase = phase
except (json.JSONDecodeError, TypeError):
continue
if max_iter >= 0:
iteration_offset = max_iter + 1
logger.info(
"Session '%s' resuming with iteration_offset=%d (from events.jsonl max), last phase: %s",
session.id,
iteration_offset,
last_phase or "unknown",
)
except OSError:
pass
session.event_bus.set_session_log(events_path, iteration_offset=iteration_offset)
logger.debug("[_start_queen] Calling create_queen...")
session.queen_task = await create_queen(
session=session,
session_manager=self,
worker_identity=worker_identity,
queen_dir=queen_dir,
queen_profile=queen_profile,
initial_prompt=initial_prompt,
initial_phase=initial_phase,
tool_registry=self._queen_tool_registry,
)
logger.debug(
"[_start_queen] create_queen returned, queen_task=%s, queen_executor=%s",
session.queen_task,
session.queen_executor,
)
# Phase 2 wiring: stand up a real ColonyRuntime that shares the
# queen's llm, tools, event bus, and storage path. In a DM session
# it has no parallel workers (the queen runs in queen_task), but
# the run_parallel_workers tool (Phase 4) will use this runtime
# as the spawn surface, and worker SUBAGENT_REPORT events flow
# back through the shared event_bus to the existing SSE.
try:
await self._start_unified_colony_runtime(session, queen_dir)
except Exception:
# ColonyRuntime is dormant infrastructure today — never let
# its construction abort queen startup. Phase 4 will harden.
logger.warning(
"_start_queen: unified ColonyRuntime construction failed",
exc_info=True,
)
# Auto-load worker on cold restore — the queen's conversation expects
# the colony to be loaded, but the new session has no worker.
if session.queen_resume_from and not session.colony_runtime:
meta_path = queen_dir / "meta.json"
if meta_path.exists():
try:
_meta = json.loads(meta_path.read_text(encoding="utf-8"))
_agent_path = _meta.get("agent_path")
if _agent_path and Path(_agent_path).exists():
await self.load_colony(session.id, _agent_path)
if session.phase_state:
# Restored colony session lands in reviewing — the
# queen summarises whatever the last run produced
# before the user decides what to do next.
await session.phase_state.switch_to_reviewing(source="auto")
logger.info("Cold restore: auto-loaded colony from %s", _agent_path)
except Exception:
logger.warning("Cold restore: failed to auto-load colony", exc_info=True)
# ------------------------------------------------------------------
# Phase 2: unified ColonyRuntime construction
# ------------------------------------------------------------------
async def _start_unified_colony_runtime(
self,
session: Session,
queen_dir: Path,
) -> None:
"""Build a real ColonyRuntime sharing the queen's resources.
This is the Phase 2 wiring. The ColonyRuntime is created with:
- ``llm`` → ``session.llm``
- ``event_bus`` → ``session.event_bus`` (so worker SUBAGENT_REPORT
and lifecycle events flow through the same bus the SSE handler
already subscribes to)
- ``tools`` → the queen's resolved tool list (stashed by
``create_queen`` on ``session._queen_tools``)
- ``storage_path`` → ``queen_dir`` (parallel workers will land
under ``{queen_dir}/workers/{worker_id}/`` thanks to
``ColonyRuntime.spawn``)
- ``colony_id`` → ``session.id``
The runtime is started but no overseer is attached — the queen
still runs as ``session.queen_task`` from ``create_queen``. This
is dormant fan-out infrastructure: ``run_parallel_workers``
(Phase 4) is what activates it.
"""
from framework.agent_loop.types import AgentSpec
from framework.host.colony_runtime import ColonyRuntime
from framework.schemas.goal import Goal
queen_tools = getattr(session, "_queen_tools", None) or []
queen_tool_executor = getattr(session, "_queen_tool_executor", None)
colony_spec = AgentSpec(
id="queen_colony",
name="Queen Colony",
description=(
"Unified colony runtime hosting the queen overseer and "
"any parallel workers spawned via run_parallel_workers."
),
system_prompt="",
tools=[t.name for t in queen_tools],
tool_access_policy="all",
)
colony_goal = Goal(
id=f"colony_goal_{session.id}",
name=f"Session {session.id}",
description="Default goal for the session-level ColonyRuntime.",
)
colony = ColonyRuntime(
agent_spec=colony_spec,
goal=colony_goal,
storage_path=queen_dir,
llm=session.llm,
tools=queen_tools,
tool_executor=queen_tool_executor,
event_bus=session.event_bus,
colony_id=session.id,
# Wire the on-disk colony name and queen id so
# ColonyRuntime auto-derives its override paths. DM sessions
# have no colony_name (session.colony_name is None), which
# keeps them out of the per-colony JSON store.
colony_name=getattr(session, "colony_name", None),
queen_id=getattr(session, "queen_name", None) or None,
pipeline_stages=[], # queen pipeline runs in queen_orchestrator, not here
)
# Per-colony tool allowlist, loaded from the colony's metadata.json
# when this session is attached to a real forked colony. For pure
# queen DM sessions (session.colony_name is None) we only capture
# the MCP-origin set — the allowlist stays ``None`` so every MCP
# tool passes through by default.
try:
mcp_tool_names_all: set[str] = set()
mgr_catalog = getattr(self, "_mcp_tool_catalog", None)
if isinstance(mgr_catalog, dict):
for entries in mgr_catalog.values():
for entry in entries:
name = entry.get("name") if isinstance(entry, dict) else None
if name:
mcp_tool_names_all.add(name)
enabled_mcp_tools: list[str] | None = None
colony_name = getattr(session, "colony_name", None)
if colony_name:
from framework.host.colony_metadata import load_colony_metadata
colony_meta = load_colony_metadata(colony_name)
raw = colony_meta.get("enabled_mcp_tools")
if raw is None or isinstance(raw, list):
enabled_mcp_tools = raw
colony.set_tool_allowlist(enabled_mcp_tools, mcp_tool_names_all)
except Exception:
logger.debug(
"Colony allowlist bootstrap failed for session %s",
session.id,
exc_info=True,
)
await colony.start()
session.colony = colony
logger.info(
"_start_queen: unified ColonyRuntime ready for session %s (%d tools, storage=%s)",
session.id,
len(queen_tools),
queen_dir,
)
# ------------------------------------------------------------------
# Queen notifications
# ------------------------------------------------------------------
async def _notify_queen_colony_loaded(self, session: Session) -> None:
"""Inject a system message into the queen about the loaded colony."""
from framework.tools.queen_lifecycle_tools import build_worker_profile
executor = session.queen_executor
if executor is None:
return
node = executor.node_registry.get("queen")
if node is None or not hasattr(node, "inject_event"):
return
profile = build_worker_profile(session.colony_runtime, agent_path=session.worker_path)
# Append available trigger info so the queen knows what's schedulable
trigger_lines = ""
if session.available_triggers:
parts = []
for t in session.available_triggers.values():
cfg = t.trigger_config
detail = cfg.get("cron") or f"every {cfg.get('interval_minutes', '?')} min"
task_info = f' -> task: "{t.task}"' if t.task else " (no task configured)"
parts.append(f" - {t.id} ({t.trigger_type}: {detail}){task_info}")
trigger_lines = "\n\nAvailable triggers (inactive — use set_trigger to activate):\n" + "\n".join(parts)
await node.inject_event(f"[SYSTEM] Colony loaded.{profile}{trigger_lines}")
async def _emit_colony_loaded(self, session: Session) -> None:
"""Publish a WORKER_COLONY_LOADED event so the frontend can update."""
from framework.host.event_bus import AgentEvent, EventType
info = session.worker_info
await session.event_bus.publish(
AgentEvent(
type=EventType.WORKER_COLONY_LOADED,
stream_id="queen",
data={
"colony_id": session.colony_id,
"colony_name": info.name if info else session.colony_id,
"agent_path": str(session.worker_path) if session.worker_path else "",
"goal": info.goal_name if info else "",
"node_count": info.node_count if info else 0,
},
)
)
async def _notify_queen_worker_unloaded(self, session: Session) -> None:
"""Notify the queen that the worker has been unloaded."""
executor = session.queen_executor
if executor is None:
return
node = executor.node_registry.get("queen")
if node is None or not hasattr(node, "inject_event"):
return
await node.inject_event(
"[SYSTEM] Worker unloaded. You are now operating independently. "
"Design or build the agent to solve the user's problem "
"according to your current phase."
)
async def _emit_trigger_events(
self,
session: Session,
kind: str,
triggers: dict[str, TriggerDefinition],
) -> None:
"""Emit TRIGGER_AVAILABLE / ACTIVATED / REMOVED events for each trigger."""
from framework.host.event_bus import AgentEvent, EventType
if kind == "activated":
event_type = EventType.TRIGGER_ACTIVATED
elif kind == "removed":
event_type = EventType.TRIGGER_REMOVED
else:
event_type = EventType.TRIGGER_AVAILABLE
# Resolve entry node for trigger target
runner = getattr(session, "runner", None)
colony_entry = runner.graph.entry_node if runner else None
fire_times = getattr(session, "trigger_next_fire", {})
fire_stats = getattr(session, "trigger_fire_stats", {})
now_mono = time.monotonic()
now_wall = time.time()
for t in triggers.values():
# Merge ephemeral next-fire data + historical fire stats into
# trigger_config so the UI can render a live-ticking countdown
# and a "fired Nx · last 2m ago" badge. `next_fire_at` is epoch
# milliseconds (wall clock) — the frontend anchors its ticker
# on this. `next_fire_in` is kept for legacy consumers.
config_out = dict(t.trigger_config)
mono = fire_times.get(t.id)
if mono is not None:
remaining = max(0.0, mono - now_mono)
config_out["next_fire_in"] = remaining
config_out["next_fire_at"] = int((now_wall + remaining) * 1000)
stats = fire_stats.get(t.id)
if stats:
config_out["fire_count"] = stats.get("fire_count", 0)
if stats.get("last_fired_at") is not None:
config_out["last_fired_at"] = stats["last_fired_at"]
await session.event_bus.publish(
AgentEvent(
type=event_type,
stream_id="queen",
data={
"trigger_id": t.id,
"trigger_type": t.trigger_type,
"trigger_config": config_out,
"name": t.description or t.id,
**({"entry_node": colony_entry} if colony_entry else {}),
},
)
)
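# Worked example of the next-fire merge above (numbers invented): if the
# monotonic deadline is 90 seconds away, the event carries
#   next_fire_in = 90.0
#   next_fire_at = int((time.time() + 90.0) * 1000)   # epoch milliseconds
# so the frontend can anchor a countdown without re-polling.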
async def revive_queen(self, session: Session) -> None:
"""Revive a dead queen executor on an existing session.
Restarts the queen with the same session context (worker profile, tools, etc.).
"""
from framework.tools.queen_lifecycle_tools import build_worker_profile
logger.debug(
"[revive_queen] Starting revival for session '%s', current queen_executor=%s",
session.id,
session.queen_executor,
)
# Build worker identity if worker is loaded
worker_identity = (
build_worker_profile(session.colony_runtime, agent_path=session.worker_path)
if session.colony_runtime
else None
)
logger.debug("[revive_queen] worker_identity=%s", "present" if worker_identity else "None")
# Start queen with existing session context
logger.debug("[revive_queen] Calling _start_queen...")
await self._start_queen(session, worker_identity=worker_identity)
logger.info(
"Queen revived for session '%s', new queen_executor=%s",
session.id,
session.queen_executor,
)
# ------------------------------------------------------------------
# Lookups
# ------------------------------------------------------------------
def get_session(self, session_id: str) -> Session | None:
return self._sessions.get(session_id)
def get_session_by_colony_id(self, colony_id: str) -> Session | None:
"""Find a session by its loaded colony's ID."""
for s in self._sessions.values():
if s.colony_id == colony_id:
return s
return None
def get_session_for_agent(self, agent_id: str) -> Session | None:
"""Resolve an agent_id to a session (backward compat).
Checks session.id first, then session.colony_id.
"""
s = self._sessions.get(agent_id)
if s:
return s
return self.get_session_by_colony_id(agent_id)
def is_loading(self, session_id: str) -> bool:
return session_id in self._loading
def list_sessions(self) -> list[Session]:
return list(self._sessions.values())
# ------------------------------------------------------------------
# Skill override helpers — used by routes_skills to find every live
# SkillsManager affected by a queen- or colony-scope mutation so a
# single HTTP call can reload them all.
# ------------------------------------------------------------------
def iter_queen_sessions(self, queen_id: str):
"""Yield live sessions whose queen matches ``queen_id``."""
for s in self._sessions.values():
if getattr(s, "queen_name", None) == queen_id:
yield s
def iter_colony_runtimes(
self,
*,
queen_id: str | None = None,
colony_name: str | None = None,
):
"""Yield live ``ColonyRuntime`` instances matching the filters.
``queen_id`` alone → every runtime whose ``queen_id`` matches
(useful when the user toggles a queen-scope skill — all her
colonies must reload). ``colony_name`` alone → the single
runtime pinned to that colony. Both → intersection. No filters
→ every live runtime (used by global ``/api/skills`` reload).
"""
for s in self._sessions.values():
colony = getattr(s, "colony", None)
if colony is None:
continue
if queen_id is not None and getattr(colony, "queen_id", None) != queen_id:
continue
if colony_name is not None and getattr(colony, "colony_name", None) != colony_name:
continue
yield colony
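
    # Rough usage sketch for the skills routes (the reload hook and the
    # `manager` variable are assumptions for illustration, not the real handlers):
    #
    #   for runtime in manager.iter_colony_runtimes(queen_id="default"):
    #       runtime.skills_manager.reload()   # hypothetical reload hook
    #   for session in manager.iter_queen_sessions("default"):
    #       ...  # refresh that session's queen-side SkillsManager as well
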
# ------------------------------------------------------------------
# Cold session helpers (disk-only, no live runtime required)
# ------------------------------------------------------------------
@staticmethod
def get_cold_session_info(session_id: str) -> dict | None:
"""Return disk metadata for a session that is no longer live in memory.
Checks whether queen conversation files exist at
~/.hive/agents/queens/{name}/sessions/{session_id}/conversations/. Returns None when
no data is found so callers can fall through to a 404.
"""
queen_dir = _find_queen_session_dir(session_id)
convs_dir = queen_dir / "conversations"
if not convs_dir.exists():
return None
# Check whether any message part files are actually present
has_messages = False
try:
# Flat layout: conversations/parts/*.json
flat_parts = convs_dir / "parts"
if flat_parts.exists() and any(f.suffix == ".json" for f in flat_parts.iterdir()):
has_messages = True
else:
# Node-based layout: conversations/<node_id>/parts/*.json
for node_dir in convs_dir.iterdir():
if not node_dir.is_dir() or node_dir.name == "parts":
continue
parts_dir = node_dir / "parts"
if parts_dir.exists() and any(f.suffix == ".json" for f in parts_dir.iterdir()):
has_messages = True
break
except OSError:
pass
try:
created_at = queen_dir.stat().st_ctime
except OSError:
created_at = 0.0
# Read extra metadata written at session start
agent_name: str | None = None
agent_path: str | None = None
colony_spawned: bool = False
spawned_colony_name: str | None = None
meta_path = queen_dir / "meta.json"
if meta_path.exists():
try:
meta = json.loads(meta_path.read_text(encoding="utf-8"))
agent_name = meta.get("agent_name")
agent_path = meta.get("agent_path")
created_at = meta.get("created_at") or created_at
colony_spawned = bool(meta.get("colony_spawned"))
_spawned = meta.get("spawned_colony_name")
if isinstance(_spawned, str):
spawned_colony_name = _spawned
except (json.JSONDecodeError, OSError):
pass
return {
"session_id": session_id,
"cold": True,
"live": False,
"has_messages": has_messages,
"created_at": created_at,
"agent_name": agent_name,
"agent_path": agent_path,
"colony_spawned": colony_spawned,
"spawned_colony_name": spawned_colony_name,
}
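
    # Sketch of the 404 fall-through described in the docstring (FastAPI-style
    # route and the `manager` name are assumptions):
    #
    #   session = manager.get_session(session_id)
    #   if session is None:
    #       cold = manager.get_cold_session_info(session_id)
    #       if cold is None:
    #           raise HTTPException(status_code=404, detail="session not found")
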
@staticmethod
def list_cold_sessions() -> list[dict]:
"""Return metadata for every queen session directory on disk, newest first."""
if not QUEENS_DIR.exists():
return []
# Collect session dirs from all queen identities
all_session_dirs: list[Path] = []
try:
for queen_dir in QUEENS_DIR.iterdir():
if not queen_dir.is_dir():
continue
sessions_dir = queen_dir / "sessions"
if sessions_dir.exists():
for d in sessions_dir.iterdir():
if d.is_dir():
all_session_dirs.append(d)
except OSError:
return []
results: list[dict] = []
for d in all_session_dirs:
if not d.is_dir():
continue
try:
created_at = d.stat().st_ctime
except OSError:
created_at = 0.0
agent_name: str | None = None
agent_path: str | None = None
meta_path = d / "meta.json"
meta: dict = {}
if meta_path.exists():
try:
meta = json.loads(meta_path.read_text(encoding="utf-8"))
agent_name = meta.get("agent_name")
agent_path = meta.get("agent_path")
created_at = meta.get("created_at") or created_at
except (json.JSONDecodeError, OSError):
pass
# Skip colony-forked sessions -- these belong to colonies,
# not to the queen DM history.
if meta.get("colony_fork"):
continue
# Build a quick preview of the last human/assistant exchange.
# We read all conversation parts, filter to client-facing messages,
# and return the last assistant message content as a snippet.
last_message: str | None = None
message_count: int = 0
# Last-activity timestamp — mtime of the latest client-facing message.
# Falls back to session creation time for empty sessions. NOTE: the
# session directory's own mtime is NOT reliable here — POSIX dir mtime
# only updates when direct entries change, and conversation parts are
# nested under conversations/parts/, so writing a new part does not
# bubble up to the session dir.
last_active_at: float = float(created_at) if isinstance(created_at, (int, float)) else 0.0
convs_dir = d / "conversations"
if convs_dir.exists():
try:
all_parts: list[dict] = []
def _collect_parts(parts_dir: Path, _dest: list[dict] = all_parts) -> None:
if not parts_dir.exists():
return
for part_file in sorted(parts_dir.iterdir()):
if part_file.suffix != ".json":
continue
try:
part = json.loads(part_file.read_text(encoding="utf-8"))
part.setdefault("created_at", part_file.stat().st_mtime)
_dest.append(part)
except (json.JSONDecodeError, OSError):
continue
# Flat layout: conversations/parts/*.json
_collect_parts(convs_dir / "parts")
# Node-based layout: conversations/<node_id>/parts/*.json
for node_dir in convs_dir.iterdir():
if not node_dir.is_dir() or node_dir.name == "parts":
continue
_collect_parts(node_dir / "parts")
# Filter to client-facing messages only
client_msgs = [
p
for p in all_parts
if not p.get("is_transition_marker")
and p.get("role") != "tool"
and not (p.get("role") == "assistant" and p.get("tool_calls"))
]
client_msgs.sort(key=lambda m: m.get("created_at", m.get("seq", 0)))
message_count = len(client_msgs)
# Take the latest message's timestamp as the activity marker.
# _collect_parts sets created_at via setdefault to the part
# file's mtime, so this is always a valid float.
if client_msgs:
latest_ts = client_msgs[-1].get("created_at")
if isinstance(latest_ts, (int, float)) and latest_ts > last_active_at:
last_active_at = float(latest_ts)
# Last assistant message as preview snippet
for msg in reversed(client_msgs):
content = msg.get("content") or ""
if isinstance(content, list):
# Anthropic-style content blocks
content = " ".join(
b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text"
)
if content and msg.get("role") == "assistant":
last_message = content[:120].strip()
break
except OSError:
pass
# Derive queen_id from directory structure: queens/{queen_id}/sessions/{session_id}
queen_id = d.parent.parent.name if d.parent.name == "sessions" else None
results.append(
{
"session_id": d.name,
"cold": True, # caller overrides for live sessions
"live": False,
"has_messages": convs_dir.exists() and message_count > 0,
"created_at": created_at,
"last_active_at": last_active_at,
"agent_name": agent_name,
"agent_path": agent_path,
"last_message": last_message,
"message_count": message_count,
"queen_id": queen_id,
}
)
# Sort by last-activity timestamp, newest first. This is the order
# callers (including /api/sessions/history and colony-chat cold resume)
# rely on — don't use raw directory mtime, which doesn't update when
# nested conversation parts are written.
results.sort(key=lambda r: r.get("last_active_at") or 0.0, reverse=True)
return results
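
    # Illustrative merge with live sessions for a history listing (the override
    # step mirrors the "caller overrides for live sessions" note above; the
    # `manager` name is assumed):
    #
    #   live_ids = {s.id for s in manager.list_sessions()}
    #   history = manager.list_cold_sessions()
    #   for entry in history:
    #       if entry["session_id"] in live_ids:
    #           entry["live"], entry["cold"] = True, False
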
async def shutdown_all(self) -> None:
"""Gracefully stop all sessions. Called on server shutdown."""
session_ids = list(self._sessions.keys())
for sid in session_ids:
await self.stop_session(sid)
logger.info("All sessions stopped")