1407 lines
57 KiB
Python
1407 lines
57 KiB
Python
"""Session-primary lifecycle manager for the HTTP API server.
|
|
|
|
Sessions (queen) are the primary entity. Workers are optional and can be
|
|
loaded/unloaded while the queen stays alive.
|
|
|
|
Architecture:
|
|
- Session owns EventBus + LLM, shared with queen and worker
|
|
- Queen is always present once a session starts
|
|
- Worker is optional — loaded into an existing session
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import time
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from framework.config import QUEENS_DIR
|
|
from framework.host.triggers import TriggerDefinition
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _queen_session_dir(session_id: str, queen_name: str = "default") -> Path:
    """Build the storage path ``QUEENS_DIR/<queen_name>/sessions/<session_id>``.

    Pure path arithmetic: nothing is created on, or read from, disk.

    Args:
        session_id: Identifier of the session whose directory is wanted.
        queen_name: Queen profile that owns the session (``"default"`` when
            omitted).

    Returns:
        The directory path for that queen/session pair.
    """
    sessions_root = QUEENS_DIR / queen_name / "sessions"
    return sessions_root / session_id
|
|
|
|
|
|
def _find_queen_session_dir(session_id: str) -> Path:
    """Locate an existing session directory under any queen profile.

    Every sub-directory of ``QUEENS_DIR`` is probed for
    ``sessions/<session_id>`` and the first one that exists is returned.
    When nothing matches (or ``QUEENS_DIR`` itself is absent) the path under
    the ``"default"`` queen is returned, whether or not it exists on disk.
    """
    if QUEENS_DIR.exists():
        # Lazy generator: directories are probed one at a time and the scan
        # stops at the first hit.
        probes = (
            entry / "sessions" / session_id
            for entry in QUEENS_DIR.iterdir()
            if entry.is_dir()
        )
        found = next((probe for probe in probes if probe.exists()), None)
        if found is not None:
            return found
    return _queen_session_dir(session_id)
|
|
|
|
|
|
@dataclass
class Session:
    """A live session with a queen and optional worker.

    Groups the per-session infrastructure (event bus and LLM provider), the
    queen's executor/task, the optionally loaded worker graph, and the
    bookkeeping for triggers and webhook subscriptions. Only ``id``,
    ``event_bus``, ``llm`` and ``loaded_at`` are required; every other field
    has a default and is populated later by ``SessionManager``.
    """

    # Session identifier; used as the key in ``SessionManager._sessions``.
    id: str
    event_bus: Any  # EventBus — owned by session
    llm: Any  # LLMProvider — owned by session
    # Creation time as returned by ``time.time()`` (seconds since the epoch).
    loaded_at: float
    # Queen (always present once started)
    queen_executor: Any = None  # GraphExecutor for queen input injection
    queen_task: asyncio.Task | None = None
    # Loaded graph (optional)
    graph_id: str | None = None
    worker_path: Path | None = None
    runner: Any | None = None  # AgentRunner
    graph_runtime: Any | None = None  # AgentRuntime
    worker_info: Any | None = None  # AgentInfo
    # Queen phase state (building/staging/running)
    phase_state: Any = None  # QueenPhaseState
    # Worker handoff subscription (EventBus subscription ID, None when not
    # subscribed)
    worker_handoff_sub: str | None = None
    # Memory reflection + recall subscriptions (global memory)
    memory_reflection_subs: list = field(default_factory=list)  # list[str]
    # Trigger definitions loaded from agent's triggers.json (available but inactive)
    available_triggers: dict[str, TriggerDefinition] = field(default_factory=dict)
    # Active trigger tracking (IDs currently firing + their asyncio tasks)
    active_trigger_ids: set[str] = field(default_factory=set)
    active_timer_tasks: dict[str, asyncio.Task] = field(default_factory=dict)
    # Queen-owned webhook server (lazy singleton, created on first webhook trigger activation)
    queen_webhook_server: Any = None
    # EventBus subscription IDs for active webhook triggers (trigger_id -> sub_id)
    active_webhook_subs: dict[str, str] = field(default_factory=dict)
    # True after first successful worker execution (gates trigger delivery)
    worker_configured: bool = False
    # Monotonic timestamps for next trigger fire (mirrors AgentRuntime._timer_next_fire)
    trigger_next_fire: dict[str, float] = field(default_factory=dict)
    # Session directory resumption:
    # When set, _start_queen writes queen conversations to this existing session's
    # directory instead of creating a new one. This lets cold-restores accumulate
    # all messages in the original session folder so history is never fragmented.
    queen_resume_from: str | None = None
    # Queen session directory (set during _start_queen, used for shutdown reflection)
    queen_dir: Path | None = None
    # Multi-queen support: which queen profile this session uses
    queen_name: str = "default"
    # Colony name: set when a worker is loaded from a colony
    colony_name: str | None = None
|
|
|
|
|
|
class SessionManager:
|
|
"""Manages session lifecycles.
|
|
|
|
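Registry mutations are serialised with an asyncio.Lock, which only
coordinates coroutines running on a single event loop (it is not safe to
call from other OS threads). Workers are loaded via run_in_executor
(blocking I/O) then started on the event loop.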
|
|
"""
|
|
|
|
def __init__(
    self,
    model: str | None = None,
    credential_store=None,
    queen_tool_registry=None,
) -> None:
    """Initialise an empty registry and run the one-time storage migration.

    Args:
        model: Default model name used when a call does not specify one.
        credential_store: Passed through to ``AgentLoader.load`` when a
            worker is loaded.
        queen_tool_registry: Stored on the manager for later use.
    """
    # Server-wide defaults handed on to sessions and workers.
    self._model = model
    self._credential_store = credential_store
    self._queen_tool_registry = queen_tool_registry

    # Registry state. ``_lock`` is taken around mutations of the session map
    # and of the "currently loading" set.
    self._lock = asyncio.Lock()
    self._sessions: dict[str, Session] = {}
    self._loading: set[str] = set()

    # Fire-and-forget tasks (e.g. shutdown reflections) are parked here so a
    # strong reference keeps them from being garbage-collected mid-flight.
    self._background_tasks: set[asyncio.Task] = set()

    # One-time v2 directory structure migration. The import stays outside
    # the ``try`` so a missing module is still reported as an error; only a
    # failing migration run is downgraded to a warning.
    from framework.storage.migrate_v2 import run_migration

    try:
        run_migration()
    except Exception:
        logger.warning("v2 migration failed (non-fatal)", exc_info=True)
|
|
|
|
def build_llm(self, model: str | None = None):
    """Construct an LLM provider using the server's configured defaults.

    The model name is chosen in this order: the ``model`` argument, the
    manager-wide default given to the constructor, then the default carried
    by a fresh ``RuntimeConfig``.

    Returns:
        An ``AntigravityProvider`` when the hive config's ``llm`` section has
        a truthy ``use_antigravity_subscription``; otherwise a
        ``LiteLLMProvider`` built from the runtime config's model, API key,
        API base and extra keyword arguments.
    """
    from framework.config import RuntimeConfig, get_hive_config

    chosen_model = model or self._model or RuntimeConfig().model
    rc = RuntimeConfig(model=chosen_model)

    llm_settings = get_hive_config().get("llm", {})
    if not llm_settings.get("use_antigravity_subscription"):
        from framework.llm.litellm import LiteLLMProvider

        return LiteLLMProvider(
            model=rc.model,
            api_key=rc.api_key,
            api_base=rc.api_base,
            **rc.extra_kwargs,
        )

    from framework.llm.antigravity import AntigravityProvider

    return AntigravityProvider(model=rc.model)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Session lifecycle
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _create_session_core(
    self,
    session_id: str | None = None,
    model: str | None = None,
) -> Session:
    """Create session infrastructure (EventBus, LLM) without starting queen.

    Internal helper — use create_session() or create_session_with_worker_graph().

    Args:
        session_id: ID to register the session under. When falsy, an ID of
            the form ``session_<YYYYmmdd_HHMMSS>_<8 hex chars>`` is generated.
        model: Model override forwarded to ``build_llm``.

    Returns:
        The newly registered ``Session`` (queen not started, no worker).

    Raises:
        ValueError: If a session with the resolved ID is already registered.
    """
    from framework.host.event_bus import EventBus

    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    resolved_id = session_id or f"session_{ts}_{uuid.uuid4().hex[:8]}"

    # Fail fast on an obvious duplicate before paying for LLM construction.
    async with self._lock:
        if resolved_id in self._sessions:
            raise ValueError(f"Session '{resolved_id}' already exists")

    # Session owns these — shared with queen and worker
    llm = self.build_llm(model=model)
    event_bus = EventBus()

    session = Session(
        id=resolved_id,
        event_bus=event_bus,
        llm=llm,
        loaded_at=time.time(),
    )

    async with self._lock:
        # The lock was released while the session was being built, and
        # re-acquiring it can suspend this coroutine when other tasks are
        # queued on it. Re-check so a concurrent creator of the same ID is
        # rejected instead of silently overwriting (and orphaning) the
        # session that got registered first.
        if resolved_id in self._sessions:
            raise ValueError(f"Session '{resolved_id}' already exists")
        self._sessions[resolved_id] = session

    return session
|
|
|
|
def _resume_queen_name(self, session_id: str) -> str | None:
    """Best-effort queen identity lookup for a persisted session.

    Resolution order:
      1. A non-blank string ``queen_id`` in the session's ``meta.json``.
      2. The name of the queen directory that contains the session, when
         the session lives under a ``sessions`` folder.

    Args:
        session_id: ID of the persisted session to inspect.

    Returns:
        The queen name, or ``None`` when the session directory does not
        exist or no identity can be derived.
    """
    session_dir = _find_queen_session_dir(session_id)
    if not session_dir.exists():
        return None

    meta_path = session_dir / "meta.json"
    if meta_path.exists():
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for undecodable bytes.
            meta = {}
        # Valid JSON is not necessarily an object (e.g. "[]" or "null"); a
        # best-effort lookup must not blow up on that.
        queen_id = meta.get("queen_id") if isinstance(meta, dict) else None
        if isinstance(queen_id, str) and queen_id.strip():
            return queen_id.strip()

    # Fall back to the directory layout: <queen>/sessions/<session_id>.
    if session_dir.parent.name == "sessions":
        queen_id = session_dir.parent.parent.name
        if queen_id:
            return queen_id
    return None
|
|
|
|
async def _ensure_session_queen_identity(
    self,
    session: Session,
    initial_prompt: str | None = None,
) -> dict:
    """Pick the queen profile for ``session`` and return it loaded.

    Candidates are tried in order: the queen already bound to the session
    (unless blank or ``"default"``), then the queen recorded for the session
    being resumed. The first candidate whose profile loads wins. If none
    does, ``select_queen`` is asked to choose one from ``initial_prompt``
    using the session's LLM.

    ``session.queen_name`` is updated to the winning queen, and the returned
    profile can be handed straight to the orchestrator without a second
    load from disk.
    """
    from framework.agents.queen.queen_profiles import (
        ensure_default_queens,
        load_queen_profile,
        select_queen,
    )

    ensure_default_queens()

    preferred: list[str] = []

    bound_queen = (session.queen_name or "").strip()
    if bound_queen not in ("", "default"):
        preferred.append(bound_queen)

    resume_id = session.queen_resume_from
    if resume_id:
        persisted_queen = self._resume_queen_name(resume_id)
        if persisted_queen and persisted_queen not in preferred:
            preferred.append(persisted_queen)

    for name in preferred:
        try:
            profile = load_queen_profile(name)
        except FileNotFoundError:
            logger.warning("Session '%s': queen profile '%s' not found", session.id, name)
        else:
            session.queen_name = name
            return profile

    # No usable candidate: let the selector decide from the prompt text.
    chosen = await select_queen(initial_prompt or "", session.llm)
    session.queen_name = chosen
    return load_queen_profile(chosen)
|
|
|
|
async def create_session(
    self,
    session_id: str | None = None,
    model: str | None = None,
    initial_prompt: str | None = None,
    queen_resume_from: str | None = None,
    queen_name: str | None = None,
    initial_phase: str | None = None,
) -> Session:
    """Create a new session with a queen but no worker.

    When ``queen_resume_from`` is set the queen writes conversation messages
    to that existing session's directory instead of creating a new one.
    This preserves full conversation history across server restarts.

    When ``queen_name`` is set the session is pre-bound to that queen
    identity, skipping LLM auto-selection in the identity hook.

    Args:
        session_id: Desired session ID; ignored when ``queen_resume_from``
            is given (the resumed ID is reused instead).
        model: Model override for the session's LLM.
        initial_prompt: Forwarded to the queen start-up.
        queen_resume_from: ID of a persisted session to continue.
        queen_name: Queen profile to bind the session to.
        initial_phase: Forwarded to the queen start-up.

    Returns:
        The running queen-only session.

    Raises:
        ValueError: If the resolved session ID is already registered.
        Exception: Whatever the queen start-up raised; the half-created
            session is unregistered before the error propagates.
    """
    # Reuse the original session ID when cold-restoring
    resolved_session_id = queen_resume_from or session_id
    session = await self._create_session_core(session_id=resolved_session_id, model=model)
    session.queen_resume_from = queen_resume_from
    if queen_name:
        session.queen_name = queen_name

    # Start queen immediately (queen-only, no worker tools yet)
    try:
        await self._start_queen(
            session,
            worker_identity=None,
            initial_prompt=initial_prompt,
            initial_phase=initial_phase,
        )
    except Exception:
        # Mirror create_session_with_worker_graph: never leave a queen-less
        # session registered, otherwise its ID stays blocked ("already
        # exists") and its resources are never released.
        await self.stop_session(session.id)
        raise

    logger.info(
        "Session '%s' created (queen-only, resume_from=%s)",
        session.id,
        queen_resume_from,
    )
    return session
|
|
|
|
async def create_session_with_worker_graph(
    self,
    agent_path: str | Path,
    agent_id: str | None = None,
    session_id: str | None = None,
    model: str | None = None,
    initial_prompt: str | None = None,
    queen_resume_from: str | None = None,
    queen_name: str | None = None,
    initial_phase: str | None = None,
) -> Session:
    """Create a session and load a worker in one step.

    When ``queen_resume_from`` is set the session reuses the original session
    ID so the frontend sees a single continuous session. The queen writes
    conversation messages to that existing directory, preserving full history.

    Args:
        agent_path: Directory of the worker agent to load.
        agent_id: Graph ID to register; defaults to the directory name.
        session_id: Desired session ID for a fresh session; ignored when
            ``queen_resume_from`` is given.
        model: Model override for the session LLM and the worker.
        initial_prompt: Forwarded to the queen start-up.
        queen_resume_from: ID of a persisted session to cold-restore.
        queen_name: Queen profile to bind; takes precedence over the queen
            recorded in the resumed session's ``meta.json``.
        initial_phase: Forwarded to the queen start-up.

    Returns:
        The running session. On a cold restore whose worker cannot be loaded
        (or was still being built) this is a queen-only session.

    Raises:
        ValueError: If the resolved session ID is already registered.
        Exception: Any worker-load/queen-start failure on a non-restore
            call, after the session has been torn down.
    """
    from framework.tools.queen_lifecycle_tools import build_worker_profile

    agent_path = Path(agent_path)
    resolved_graph_id = agent_id or agent_path.name

    # When cold-restoring, check meta.json for the phase — if the agent
    # was still being built we must NOT try to load the worker (the code
    # is incomplete and will fail to import).
    _resume_queen_id: str | None = None
    if queen_resume_from:
        _resume_phase = None
        _meta_path = _find_queen_session_dir(queen_resume_from) / "meta.json"
        if _meta_path.exists():
            try:
                _meta = json.loads(_meta_path.read_text(encoding="utf-8"))
            except (ValueError, OSError):
                # ValueError covers JSONDecodeError and UnicodeDecodeError.
                _meta = None
            # Valid JSON may still not be an object; ignore it in that case
            # instead of failing the whole restore with AttributeError.
            if isinstance(_meta, dict):
                _resume_phase = _meta.get("phase")
                _persisted_queen = _meta.get("queen_id")
                if isinstance(_persisted_queen, str) and _persisted_queen.strip():
                    _resume_queen_id = _persisted_queen.strip()
        if _resume_phase in ("building", "planning"):
            # Fall back to queen-only session — cold resume handler in
            # _start_queen will set phase_state.agent_path and switch to
            # the correct phase (so initial_phase is deliberately not
            # forwarded). An explicit queen_name is still honoured.
            return await self.create_session(
                session_id=session_id,
                model=model,
                initial_prompt=initial_prompt,
                queen_resume_from=queen_resume_from,
                queen_name=queen_name,
            )

    # Reuse the original session ID when cold-restoring so the frontend
    # sees one continuous session instead of a new one each time. For a
    # fresh session honour the caller's ``session_id`` (same resolution as
    # create_session) rather than silently discarding it.
    session = await self._create_session_core(
        session_id=queen_resume_from or session_id,
        model=model,
    )
    session.queen_resume_from = queen_resume_from
    if queen_name:
        session.queen_name = queen_name
    elif _resume_queen_id:
        session.queen_name = _resume_queen_id
    try:
        # Load the graph FIRST (before queen) so queen gets full tools
        await self._load_worker_core(
            session,
            agent_path,
            graph_id=resolved_graph_id,
            model=model,
        )

        # Restore active triggers from persisted state (cold restore)
        await self._restore_active_triggers(session, session.id)

        # Start queen with worker profile + lifecycle + monitoring tools
        worker_identity = (
            build_worker_profile(session.graph_runtime, agent_path=agent_path)
            if session.graph_runtime
            else None
        )
        await self._start_queen(
            session,
            worker_identity=worker_identity,
            initial_prompt=initial_prompt,
            initial_phase=initial_phase,
        )

    except Exception:
        if queen_resume_from:
            # Cold restore: worker load failed (e.g. incomplete code from a
            # building session). Fall back to queen-only so the user can
            # continue the conversation and fix / rebuild the agent.
            logger.warning(
                "Cold restore: worker load failed for '%s', falling back to queen-only",
                agent_path,
                exc_info=True,
            )
            await self.stop_session(session.id)
            return await self.create_session(
                session_id=session_id,
                model=model,
                initial_prompt=initial_prompt,
                queen_resume_from=queen_resume_from,
                queen_name=queen_name,
            )
        # If anything fails (non-cold-restore), tear down the session
        await self.stop_session(session.id)
        raise
    return session
|
|
|
|
# ------------------------------------------------------------------
|
|
# Worker lifecycle
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _load_worker_core(
    self,
    session: Session,
    agent_path: str | Path,
    graph_id: str | None = None,
    model: str | None = None,
) -> None:
    """Load a graph into a session (core logic).

    Sets up the runner, runtime, and session fields. Does NOT notify
    the queen — callers handle that step.

    Args:
        session: Session that will own the worker.
        agent_path: Directory of the agent to load.
        graph_id: ID to record for the graph; defaults to the directory name.
        model: Model override. When ``None`` the worker shares the session's
            LLM instance.

    Raises:
        ValueError: If the session already has a graph, or another load for
            the same session is still in progress.
    """
    from framework.loader import AgentLoader

    agent_path = Path(agent_path)
    resolved_graph_id = graph_id or agent_path.name

    if session.graph_runtime is not None:
        raise ValueError(f"Session '{session.id}' already has graph '{session.graph_id}'")

    async with self._lock:
        if session.id in self._loading:
            raise ValueError(f"Session '{session.id}' is currently loading a graph")
        self._loading.add(session.id)

    try:
        # Blocking I/O — load in executor
        loop = asyncio.get_running_loop()
        # By default, workers share the session's LLM with the queen so
        # execution and memory reflection/recall stay on the same model.
        session_model = getattr(session.llm, "model", None)
        resolved_model = model or session_model or self._model
        runner = await loop.run_in_executor(
            None,
            lambda: AgentLoader.load(
                agent_path,
                model=resolved_model,
                interactive=False,
                skip_credential_validation=True,
                credential_store=self._credential_store,
            ),
        )

        if model is None:
            runner._llm = session.llm

        # Setup with session's event bus
        if runner._agent_runtime is None:
            await loop.run_in_executor(
                None,
                lambda: runner._setup(event_bus=session.event_bus),
            )

        runtime = runner._agent_runtime

        # Load triggers from the agent's triggers.json definition file.
        from framework.tools.queen_lifecycle_tools import _read_agent_triggers_json

        for tdata in _read_agent_triggers_json(agent_path):
            tid = tdata.get("id", "")
            ttype = tdata.get("trigger_type", "")
            # Only timer and webhook triggers with a non-empty ID are kept.
            if tid and ttype in ("timer", "webhook"):
                session.available_triggers[tid] = TriggerDefinition(
                    id=tid,
                    trigger_type=ttype,
                    trigger_config=tdata.get("trigger_config", {}),
                    description=tdata.get("name", tid),
                    task=tdata.get("task", ""),
                )
                logger.info("Loaded trigger '%s' (%s) from triggers.json", tid, ttype)

        if session.available_triggers:
            await self._emit_trigger_events(session, "available", session.available_triggers)

        # Start runtime on event loop
        if runtime and not runtime.is_running:
            await runtime.start()

        # Clean up stale "active" sessions from previous (dead) processes
        self._cleanup_stale_active_sessions(agent_path)

        info = runner.info()

        # Update session
        session.graph_id = resolved_graph_id
        session.worker_path = agent_path
        session.runner = runner
        session.graph_runtime = runtime
        session.worker_info = info

        logger.info(
            "Worker '%s' loaded into session '%s'",
            resolved_graph_id,
            session.id,
        )
    finally:
        # ``finally`` rather than ``except Exception``: task cancellation
        # raises asyncio.CancelledError, which is a BaseException. Without
        # this the session would stay flagged as "currently loading" forever
        # after a cancelled load and reject every later attempt.
        async with self._lock:
            self._loading.discard(session.id)
|
|
|
|
def _cleanup_stale_active_sessions(self, agent_path: Path) -> None:
|
|
"""Mark stale 'active' sessions on disk as 'cancelled'.
|
|
|
|
When a new runtime starts, any on-disk session still marked 'active'
|
|
is from a process that no longer exists. 'Paused' sessions are left
|
|
intact so they remain resumable.
|
|
|
|
Two-layer protection against corrupting live sessions:
|
|
1. In-memory: skip any session ID currently tracked in self._sessions
|
|
(guaranteed alive in this process).
|
|
2. PID validation: if state.json contains a ``pid`` field, check whether
|
|
that process is still running on the host. If it is, the session is
|
|
owned by another healthy worker process, so leave it alone.
|
|
"""
|
|
sessions_path = Path.home() / ".hive" / "agents" / agent_path.name / "sessions"
|
|
if not sessions_path.exists():
|
|
return
|
|
|
|
live_session_ids = set(self._sessions.keys())
|
|
|
|
for d in sessions_path.iterdir():
|
|
if not d.is_dir() or not d.name.startswith("session_"):
|
|
continue
|
|
state_path = d / "state.json"
|
|
if not state_path.exists():
|
|
continue
|
|
try:
|
|
state = json.loads(state_path.read_text(encoding="utf-8"))
|
|
if state.get("status") != "active":
|
|
continue
|
|
|
|
# Layer 1: skip sessions that are alive in this process
|
|
session_id = state.get("session_id", d.name)
|
|
if session_id in live_session_ids or d.name in live_session_ids:
|
|
logger.debug(
|
|
"Skipping live in-memory session '%s' during stale cleanup",
|
|
d.name,
|
|
)
|
|
continue
|
|
|
|
# Layer 2: skip sessions whose owning process is still alive
|
|
recorded_pid = state.get("pid")
|
|
if recorded_pid is not None and self._is_pid_alive(recorded_pid):
|
|
logger.debug(
|
|
"Skipping session '%s' — owning process %d is still running",
|
|
d.name,
|
|
recorded_pid,
|
|
)
|
|
continue
|
|
|
|
state["status"] = "cancelled"
|
|
state.setdefault("result", {})["error"] = "Stale session: runtime restarted"
|
|
state.setdefault("timestamps", {})["updated_at"] = datetime.now().isoformat()
|
|
state_path.write_text(json.dumps(state, indent=2), encoding="utf-8")
|
|
logger.info(
|
|
"Marked stale session '%s' as cancelled for agent '%s'", d.name, agent_path.name
|
|
)
|
|
except (json.JSONDecodeError, OSError) as e:
|
|
logger.warning("Failed to clean up stale session %s: %s", d.name, e)
|
|
|
|
@staticmethod
|
|
def _is_pid_alive(pid: int) -> bool:
|
|
"""Check whether a process with the given PID is still running."""
|
|
import os
|
|
import platform
|
|
|
|
if platform.system() == "Windows":
|
|
import ctypes
|
|
|
|
# PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
|
|
kernel32 = ctypes.windll.kernel32
|
|
handle = kernel32.OpenProcess(0x1000, False, pid)
|
|
if not handle:
|
|
# 5 is ERROR_ACCESS_DENIED, meaning the process exists but is protected
|
|
return kernel32.GetLastError() == 5
|
|
|
|
exit_code = ctypes.c_ulong()
|
|
kernel32.GetExitCodeProcess(handle, ctypes.byref(exit_code))
|
|
kernel32.CloseHandle(handle)
|
|
# 259 is STILL_ACTIVE
|
|
return exit_code.value == 259
|
|
else:
|
|
try:
|
|
os.kill(pid, 0)
|
|
except OSError:
|
|
return False
|
|
return True
|
|
|
|
async def _restore_active_triggers(self, session: "Session", session_id: str) -> None:
|
|
"""Restore previously active triggers from persisted session state.
|
|
|
|
Called after worker loading to restart any timer/webhook triggers
|
|
that were active before a server restart.
|
|
"""
|
|
if not session.available_triggers or not session.graph_runtime:
|
|
return
|
|
try:
|
|
store = session.graph_runtime._session_store
|
|
state = await store.read_state(session_id)
|
|
if state and state.active_triggers:
|
|
from framework.tools.queen_lifecycle_tools import (
|
|
_start_trigger_timer,
|
|
_start_trigger_webhook,
|
|
)
|
|
|
|
saved_tasks = getattr(state, "trigger_tasks", {}) or {}
|
|
for tid in state.active_triggers:
|
|
tdef = session.available_triggers.get(tid)
|
|
if tdef:
|
|
# Restore user-configured task override
|
|
saved_task = saved_tasks.get(tid, "")
|
|
if saved_task:
|
|
tdef.task = saved_task
|
|
tdef.active = True
|
|
session.active_trigger_ids.add(tid)
|
|
if tdef.trigger_type == "timer":
|
|
await _start_trigger_timer(session, tid, tdef)
|
|
logger.info("Restored trigger timer '%s'", tid)
|
|
elif tdef.trigger_type == "webhook":
|
|
await _start_trigger_webhook(session, tid, tdef)
|
|
logger.info("Restored webhook trigger '%s'", tid)
|
|
else:
|
|
logger.warning(
|
|
"Saved trigger '%s' not found in worker entry points, skipping",
|
|
tid,
|
|
)
|
|
|
|
# Restore worker_configured flag
|
|
if state and getattr(state, "worker_configured", False):
|
|
session.worker_configured = True
|
|
except Exception as e:
|
|
logger.warning("Failed to restore active triggers: %s", e)
|
|
|
|
async def load_graph(
    self,
    session_id: str,
    agent_path: str | Path,
    graph_id: str | None = None,
    model: str | None = None,
) -> Session:
    """Load a graph into an existing session (with running queen).

    Starts the graph runtime and notifies the queen.

    Args:
        session_id: ID of a registered session.
        agent_path: Directory of the agent to load.
        graph_id: ID to record for the graph; defaults to the directory name.
        model: Model override for the worker.

    Returns:
        The session, now carrying the loaded worker.

    Raises:
        ValueError: If the session is unknown, already has a graph, or is
            currently loading one.
    """
    agent_path = Path(agent_path)

    session = self._sessions.get(session_id)
    if session is None:
        raise ValueError(f"Session '{session_id}' not found")

    await self._load_worker_core(
        session,
        agent_path,
        graph_id=graph_id,
        model=model,
    )

    # Notify queen about the loaded worker (skip for queen itself).
    if agent_path.name != "queen" and session.graph_runtime:
        await self._notify_queen_graph_loaded(session)

    # Update meta.json so cold-restore can discover this session by agent_path
    storage_session_id = session.queen_resume_from or session.id
    meta_path = _queen_session_dir(storage_session_id, session.queen_name) / "meta.json"
    try:
        _agent_name = (
            session.worker_info.name
            if session.worker_info
            else str(agent_path.name).replace("_", " ").title()
        )
        existing_meta: dict = {}
        if meta_path.exists():
            # A corrupt or non-object meta.json must not fail the load after
            # the worker is already running; start from an empty mapping,
            # as _start_queen does for the same file.
            try:
                loaded_meta = json.loads(meta_path.read_text(encoding="utf-8"))
            except ValueError:
                # Covers json.JSONDecodeError and UnicodeDecodeError.
                loaded_meta = None
            if isinstance(loaded_meta, dict):
                existing_meta = loaded_meta
        existing_meta["agent_name"] = _agent_name
        existing_meta["agent_path"] = (
            str(session.worker_path) if session.worker_path else str(agent_path)
        )
        meta_path.write_text(json.dumps(existing_meta), encoding="utf-8")
    except OSError:
        # Metadata is advisory; record the failure but keep the worker loaded.
        logger.debug("Could not update %s", meta_path, exc_info=True)

    await self._restore_active_triggers(session, session_id)

    # Emit SSE event so the frontend can update UI
    await self._emit_graph_loaded(session)

    return session
|
|
|
|
async def unload_graph(self, session_id: str) -> bool:
    """Detach the worker graph from a session while the queen keeps running.

    Cleans up the runner, cancels trigger timers, drops webhook
    subscriptions (the webhook server itself stays up), announces the
    removal of the available triggers, clears the worker fields on the
    session and finally tells the queen.

    Returns:
        ``False`` when the session is unknown or has no graph loaded,
        ``True`` once the graph has been unloaded.
    """
    session = self._sessions.get(session_id)
    if session is None or session.graph_runtime is None:
        return False

    # Worker cleanup is best-effort: a failure is logged and unloading goes on.
    runner = session.runner
    if runner:
        try:
            await runner.cleanup_async()
        except Exception as e:
            logger.error("Error cleaning up graph '%s': %s", session.graph_id, e)

    # Stop every running trigger timer.
    timers = session.active_timer_tasks
    for tid, timer_task in timers.items():
        timer_task.cancel()
        logger.info("Cancelled trigger timer '%s' on unload", tid)
    timers.clear()

    # Drop webhook handlers; the server is queen-owned and stays alive.
    bus = session.event_bus
    webhook_subs = session.active_webhook_subs
    for sub_id in webhook_subs.values():
        try:
            bus.unsubscribe(sub_id)
        except Exception:
            pass
    webhook_subs.clear()
    session.active_trigger_ids.clear()

    # Announce and forget the trigger definitions that came with the worker.
    if session.available_triggers:
        await self._emit_trigger_events(session, "removed", session.available_triggers)
        session.available_triggers.clear()

    # Remember the ID for the log line, then reset all worker fields.
    unloaded_graph_id = session.graph_id
    for worker_field in ("graph_id", "worker_path", "runner", "graph_runtime", "worker_info"):
        setattr(session, worker_field, None)

    await self._notify_queen_worker_unloaded(session)

    logger.info("Graph '%s' unloaded from session '%s'", unloaded_graph_id, session_id)
    return True
|
|
|
|
# ------------------------------------------------------------------
|
|
# Session teardown
|
|
# ------------------------------------------------------------------
|
|
|
|
async def stop_session(self, session_id: str) -> bool:
    """Stop a session entirely — unload worker + cancel queen.

    The session is removed from the registry first, then its subscriptions,
    queen task, trigger timers, webhook server, worker runner and event log
    are released. A final memory reflection is started in the background
    when the queen had a session directory.

    Args:
        session_id: ID of the session to stop.

    Returns:
        ``True`` when a session was found and stopped, ``False`` when the
        ID is unknown.
    """
    async with self._lock:
        session = self._sessions.pop(session_id, None)

    if session is None:
        return False

    if session.worker_handoff_sub is not None:
        try:
            session.event_bus.unsubscribe(session.worker_handoff_sub)
        except Exception:
            pass
        session.worker_handoff_sub = None

    # Stop memory reflection/recall subscriptions
    for sub_id in session.memory_reflection_subs:
        try:
            session.event_bus.unsubscribe(sub_id)
        except Exception:
            pass
    session.memory_reflection_subs.clear()

    # Run a final shutdown reflection so recent conversation insights
    # are persisted before the session is destroyed (fire-and-forget).
    if session.queen_dir is not None:
        try:
            from framework.agents.queen.queen_memory_v2 import (
                global_memory_dir,
                queen_memory_dir,
            )
            from framework.agents.queen.reflection_agent import run_shutdown_reflection

            global_mem_dir = global_memory_dir()
            queen_mem_dir = queen_memory_dir(session.queen_name)
            if session.phase_state is not None:
                global_mem_dir = session.phase_state.global_memory_dir or global_mem_dir
                queen_mem_dir = session.phase_state.queen_memory_dir or queen_mem_dir

            # The coroutine is handed to create_task() directly.
            # asyncio.shield() returns a Future, and create_task() rejects
            # anything that is not a coroutine with TypeError — so wrapping
            # it made this block always end in the warning below and left
            # the reflection untracked. Nothing awaits or cancels this task,
            # so shielding is not needed; the strong reference in
            # _background_tasks keeps it alive until it finishes.
            task = asyncio.create_task(
                run_shutdown_reflection(
                    session.queen_dir,
                    session.llm,
                    global_memory_dir_override=global_mem_dir,
                    queen_memory_dir=queen_mem_dir,
                    queen_id=session.queen_name,
                ),
                name=f"shutdown-reflect-{session_id}",
            )

            def _on_reflection_done(done: asyncio.Task) -> None:
                # Drop the strong reference and surface a failure, which
                # would otherwise go unreported for a fire-and-forget task.
                self._background_tasks.discard(done)
                if done.cancelled():
                    return
                error = done.exception()
                if error is not None:
                    logger.warning(
                        "Session '%s': shutdown reflection failed",
                        session_id,
                        exc_info=error,
                    )

            self._background_tasks.add(task)
            task.add_done_callback(_on_reflection_done)
            logger.info("Session '%s': shutdown reflection spawned", session_id)
        except Exception:
            logger.warning(
                "Session '%s': failed to spawn shutdown reflection", session_id, exc_info=True
            )

    if session.queen_task is not None:
        session.queen_task.cancel()
        session.queen_task = None
    session.queen_executor = None

    # Cancel active trigger timers
    for task in session.active_timer_tasks.values():
        task.cancel()
    session.active_timer_tasks.clear()

    # Unsubscribe webhook handlers and stop queen webhook server
    for sub_id in session.active_webhook_subs.values():
        try:
            session.event_bus.unsubscribe(sub_id)
        except Exception:
            pass
    session.active_webhook_subs.clear()
    if session.queen_webhook_server is not None:
        try:
            await session.queen_webhook_server.stop()
        except Exception:
            logger.error("Error stopping queen webhook server", exc_info=True)
        session.queen_webhook_server = None

    # Cleanup worker
    if session.runner:
        try:
            await session.runner.cleanup_async()
        except Exception as e:
            logger.error("Error cleaning up worker: %s", e)

    # Close per-session event log
    session.event_bus.close_session_log()

    logger.info("Session '%s' stopped", session_id)
    return True
|
|
|
|
# ------------------------------------------------------------------
|
|
# Queen startup
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _handle_worker_handoff(self, session: Session, executor: Any, event: Any) -> None:
|
|
"""Route worker escalation events into the queen conversation."""
|
|
if event.stream_id == "queen":
|
|
return
|
|
|
|
reason = str(event.data.get("reason", "")).strip()
|
|
context = str(event.data.get("context", "")).strip()
|
|
node_label = event.node_id or "unknown_node"
|
|
stream_label = event.stream_id or "unknown_stream"
|
|
|
|
handoff = (
|
|
"[WORKER_ESCALATION_REQUEST]\n"
|
|
f"stream_id: {stream_label}\n"
|
|
f"node_id: {node_label}\n"
|
|
f"reason: {reason or 'unspecified'}\n"
|
|
)
|
|
if context:
|
|
handoff += f"context:\n{context}\n"
|
|
|
|
node = executor.node_registry.get("queen")
|
|
if node is not None and hasattr(node, "inject_event"):
|
|
await node.inject_event(handoff, is_client_input=False)
|
|
else:
|
|
logger.warning("Worker handoff received but queen node not ready")
|
|
|
|
def _subscribe_worker_handoffs(self, session: Session, executor: Any) -> None:
    """(Re)register the queen's handler for worker escalation events.

    An existing handoff subscription on the session is removed first, so
    at most one is active. The new subscription listens for
    ``ESCALATION_REQUESTED`` events on the session's event bus and routes
    each one through ``_handle_worker_handoff`` with the given executor.
    The subscription ID is stored in ``session.worker_handoff_sub``.
    """
    from framework.host.event_bus import EventType as _ET

    bus = session.event_bus

    previous_sub = session.worker_handoff_sub
    if previous_sub is not None:
        bus.unsubscribe(previous_sub)
        session.worker_handoff_sub = None

    async def _on_worker_handoff(event):
        await self._handle_worker_handoff(session, executor, event)

    new_sub = bus.subscribe(
        event_types=[_ET.ESCALATION_REQUESTED],
        handler=_on_worker_handoff,
    )
    session.worker_handoff_sub = new_sub
|
|
|
|
async def _start_queen(
    self,
    session: Session,
    worker_identity: str | None,
    initial_prompt: str | None = None,
    initial_phase: str | None = None,
) -> None:
    """Start the queen executor for a session.

    When ``session.queen_resume_from`` is set, queen conversation messages
    are written to the ORIGINAL session's directory so the full conversation
    history accumulates in one place across server restarts.

    Steps, in order:
      1. Resolve the queen identity/profile for the session.
      2. Pick and create the on-disk queen directory.
      3. Merge fresh metadata into ``meta.json`` (best effort).
      4. Scan ``events.jsonl`` for the highest iteration to derive the
         resume offset, then enable the event bus session log.
      5. Create the queen task.
      6. On a resumed session without a graph runtime, restore the worker
         or phase recorded in ``meta.json`` (best effort).

    Args:
        session: Session to start the queen for; ``queen_dir`` and
            ``queen_task`` are set on it.
        worker_identity: Passed through to ``create_queen``.
        initial_prompt: Passed to identity resolution and ``create_queen``.
        initial_phase: Passed through to ``create_queen``.
    """
    from framework.server.queen_orchestrator import create_queen

    logger.debug(
        "[_start_queen] Starting for session %s, current queen_executor=%s",
        session.id,
        session.queen_executor,
    )

    queen_profile = await self._ensure_session_queen_identity(session, initial_prompt)

    # Determine which session directory to use for queen storage.
    # When queen_resume_from is set we write to the ORIGINAL session's
    # directory so that all messages accumulate in one place.
    storage_session_id = session.queen_resume_from or session.id
    queen_dir = _queen_session_dir(storage_session_id, session.queen_name)
    queen_dir.mkdir(parents=True, exist_ok=True)
    session.queen_dir = queen_dir

    # Always write/update session metadata so history sidebar has correct
    # agent name, path, and last-active timestamp (important so the original
    # session directory sorts as "most recent" after a cold-restore resume).
    _meta_path = queen_dir / "meta.json"
    try:
        _agent_name = (
            session.worker_info.name
            if session.worker_info
            else (
                str(session.worker_path.name).replace("_", " ").title()
                if session.worker_path
                else None
            )
        )
        # Merge into existing meta.json to preserve fields written by
        # _update_meta_json (e.g. phase, agent_path set during building).
        _existing_meta: dict = {}
        if _meta_path.exists():
            try:
                _loaded = json.loads(_meta_path.read_text(encoding="utf-8"))
            except (ValueError, OSError):
                # ValueError covers both JSONDecodeError and UnicodeDecodeError.
                _loaded = None
            # Valid JSON that is not an object (list, number, ...) cannot be
            # merged into; treat it like a missing file instead of crashing.
            if isinstance(_loaded, dict):
                _existing_meta = _loaded
        # NOTE: "created_at" is refreshed on every start, not only the first.
        _new_meta: dict = {
            "created_at": time.time(),
            "queen_id": session.queen_name,
        }
        if _agent_name is not None:
            _new_meta["agent_name"] = _agent_name
        if session.worker_path is not None:
            _new_meta["agent_path"] = str(session.worker_path)
        _existing_meta.update(_new_meta)
        _meta_path.write_text(json.dumps(_existing_meta), encoding="utf-8")
    except OSError:
        # Metadata is best effort; the queen can still start without it.
        logger.warning("Failed to write session metadata to %s", _meta_path, exc_info=True)

    # Enable per-session event persistence so that all eventbus events
    # survive server restarts and can be replayed on cold-session resume.
    # Scan the existing event log to find the max iteration ever written,
    # then use max+1 as offset so resumed sessions produce monotonically
    # increasing iteration values — preventing frontend message ID collisions.
    iteration_offset = 0
    last_phase = ""
    events_path = queen_dir / "events.jsonl"
    try:
        if events_path.exists():
            max_iter = -1
            # errors="replace": a few undecodable bytes must not abort the
            # whole scan (which would reset the offset to 0).
            with open(events_path, encoding="utf-8", errors="replace") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        evt = json.loads(line)
                    except ValueError:
                        continue
                    # Skip lines that parse but are not event objects.
                    if not isinstance(evt, dict):
                        continue
                    data = evt.get("data")
                    if not isinstance(data, dict):
                        data = {}
                    it = data.get("iteration")
                    if isinstance(it, int) and it > max_iter:
                        max_iter = it
                    # Track the latest queen phase from QUEEN_PHASE_CHANGED events
                    if evt.get("type") == "queen_phase_changed":
                        phase = data.get("phase")
                        if phase:
                            last_phase = phase
            if max_iter >= 0:
                iteration_offset = max_iter + 1
                logger.info(
                    "Session '%s' resuming with iteration_offset=%d"
                    " (from events.jsonl max), last phase: %s",
                    session.id,
                    iteration_offset,
                    last_phase or "unknown",
                )
    except OSError:
        pass
    session.event_bus.set_session_log(events_path, iteration_offset=iteration_offset)

    logger.debug("[_start_queen] Calling create_queen...")
    session.queen_task = await create_queen(
        session=session,
        session_manager=self,
        worker_identity=worker_identity,
        queen_dir=queen_dir,
        queen_profile=queen_profile,
        initial_prompt=initial_prompt,
        initial_phase=initial_phase,
        tool_registry=self._queen_tool_registry,
    )
    logger.debug(
        "[_start_queen] create_queen returned, queen_task=%s, queen_executor=%s",
        session.queen_task,
        session.queen_executor,
    )

    # Auto-load worker on cold restore — the queen's conversation expects
    # the agent to be loaded, but the new session has no worker.
    if session.queen_resume_from and not session.graph_runtime:
        meta_path = queen_dir / "meta.json"
        if meta_path.exists():
            try:
                _meta = json.loads(meta_path.read_text(encoding="utf-8"))
                _agent_path = _meta.get("agent_path")
                _phase = _meta.get("phase")

                if _agent_path and Path(_agent_path).exists():
                    if _phase in ("staging", "running", None):
                        # Agent fully built — load worker and resume
                        await self.load_graph(session.id, _agent_path)
                        if session.phase_state:
                            await session.phase_state.switch_to_staging(source="auto")
                        # Emit flowchart overlay so frontend can display it
                        await self._emit_flowchart_on_restore(session, _agent_path)
                        logger.info("Cold restore: auto-loaded worker from %s", _agent_path)
                    elif _phase == "building":
                        # Agent folder exists but incomplete — resume building
                        if session.phase_state:
                            session.phase_state.agent_path = _agent_path
                            await session.phase_state.switch_to_building(source="auto")
                        logger.info("Cold restore: resumed BUILDING phase for %s", _agent_path)
                    elif _phase == "planning":
                        if session.phase_state:
                            session.phase_state.agent_path = _agent_path
                        logger.info("Cold restore: PLANNING phase for %s", _agent_path)
            except Exception:
                # Restore is best effort: the queen is already running.
                logger.warning("Cold restore: failed to auto-load worker", exc_info=True)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Queen notifications
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _notify_queen_graph_loaded(self, session: Session) -> None:
    """Tell the queen, via an injected system message, that a graph was loaded.

    The message carries the worker profile and, when the session has
    available triggers, one line per trigger. Nothing is sent when the
    session has no queen executor or the queen node lacks ``inject_event``.
    """
    from framework.tools.queen_lifecycle_tools import build_worker_profile

    executor = session.queen_executor
    queen_node = None if executor is None else executor.node_registry.get("queen")
    if queen_node is None or not hasattr(queen_node, "inject_event"):
        return

    profile = build_worker_profile(session.graph_runtime, agent_path=session.worker_path)

    def _describe(trigger) -> str:
        # One bullet per trigger: id, type, schedule and the configured task.
        cfg = trigger.trigger_config
        schedule = cfg.get("cron") or f"every {cfg.get('interval_minutes', '?')} min"
        if trigger.task:
            task_info = f' -> task: "{trigger.task}"'
        else:
            task_info = " (no task configured)"
        return f" - {trigger.id} ({trigger.trigger_type}: {schedule}){task_info}"

    # Append available trigger info so the queen knows what's schedulable
    trigger_lines = ""
    if session.available_triggers:
        listing = "\n".join(_describe(t) for t in session.available_triggers.values())
        trigger_lines = (
            "\n\nAvailable triggers (inactive — use set_trigger to activate):\n" + listing
        )

    await queen_node.inject_event(f"[SYSTEM] Graph loaded.{profile}{trigger_lines}")
|
|
|
|
async def _emit_graph_loaded(self, session: Session) -> None:
    """Publish a WORKER_GRAPH_LOADED event so the frontend can update.

    Name, goal and node count come from ``session.worker_info`` when it is
    set; otherwise the graph id, an empty goal and a zero count are used.
    """
    from framework.host.event_bus import AgentEvent, EventType

    info = session.worker_info
    if info:
        graph_name, goal, node_count = info.name, info.goal_name, info.node_count
    else:
        graph_name, goal, node_count = session.graph_id, "", 0

    payload = {
        "graph_id": session.graph_id,
        "graph_name": graph_name,
        "agent_path": str(session.worker_path) if session.worker_path else "",
        "goal": goal,
        "node_count": node_count,
    }
    loaded_event = AgentEvent(
        type=EventType.WORKER_GRAPH_LOADED,
        stream_id="queen",
        data=payload,
    )
    await session.event_bus.publish(loaded_event)
|
|
|
|
async def _emit_flowchart_on_restore(self, session: Session, agent_path: str | Path) -> None:
    """Emit FLOWCHART_MAP_UPDATED from the persisted flowchart file on cold restore.

    Does nothing when ``load_flowchart_file`` yields no original draft.

    Args:
        session: Session whose event bus (and phase state, if any) is updated.
        agent_path: Location handed to ``load_flowchart_file``.
    """
    from framework.host.event_bus import AgentEvent, EventType
    from framework.tools.flowchart_utils import load_flowchart_file

    original_draft, flowchart_map = load_flowchart_file(agent_path)
    if original_draft is None:
        return

    # Cache in phase_state so the REST endpoint also returns it
    phase_state = session.phase_state
    if phase_state:
        phase_state.original_draft_graph = original_draft
        phase_state.flowchart_map = flowchart_map

    payload = {
        "map": flowchart_map,
        "original_draft": original_draft,
    }
    flowchart_event = AgentEvent(
        type=EventType.FLOWCHART_MAP_UPDATED,
        stream_id="queen",
        data=payload,
    )
    await session.event_bus.publish(flowchart_event)
|
|
|
|
async def _notify_queen_worker_unloaded(self, session: Session) -> None:
|
|
"""Notify the queen that the worker has been unloaded."""
|
|
executor = session.queen_executor
|
|
if executor is None:
|
|
return
|
|
node = executor.node_registry.get("queen")
|
|
if node is None or not hasattr(node, "inject_event"):
|
|
return
|
|
|
|
await node.inject_event(
|
|
"[SYSTEM] Worker unloaded. You are now operating independently. "
|
|
"Design or build the agent to solve the user's problem "
|
|
"according to your current phase."
|
|
)
|
|
|
|
async def _emit_trigger_events(
    self,
    session: Session,
    kind: str,
    triggers: dict[str, TriggerDefinition],
) -> None:
    """Emit one TRIGGER_AVAILABLE or TRIGGER_REMOVED event per trigger.

    Args:
        session: Session whose event bus receives the events.
        kind: ``"available"`` selects TRIGGER_AVAILABLE; any other value
            selects TRIGGER_REMOVED.
        triggers: Triggers to announce; only the values are used.
    """
    from framework.host.event_bus import AgentEvent, EventType

    if kind == "available":
        event_type = EventType.TRIGGER_AVAILABLE
    else:
        event_type = EventType.TRIGGER_REMOVED

    # Resolve graph entry node for trigger target
    runner = getattr(session, "runner", None)
    graph_entry = runner.graph.entry_node if runner else None

    for trigger in triggers.values():
        payload = {
            "trigger_id": trigger.id,
            "trigger_type": trigger.trigger_type,
            "trigger_config": trigger.trigger_config,
            "name": trigger.description or trigger.id,
        }
        # The entry node is only included when one could be resolved.
        if graph_entry:
            payload["entry_node"] = graph_entry
        await session.event_bus.publish(
            AgentEvent(type=event_type, stream_id="queen", data=payload)
        )
|
|
|
|
async def revive_queen(self, session: Session) -> None:
    """Revive a dead queen executor on an existing session.

    Rebuilds the worker profile when a graph runtime is loaded and restarts
    the queen through ``_start_queen`` with that identity.
    """
    from framework.tools.queen_lifecycle_tools import build_worker_profile

    logger.debug(
        "[revive_queen] Starting revival for session '%s', current queen_executor=%s",
        session.id,
        session.queen_executor,
    )

    # Build worker identity if worker is loaded
    if session.graph_runtime:
        worker_identity = build_worker_profile(
            session.graph_runtime, agent_path=session.worker_path
        )
    else:
        worker_identity = None
    identity_state = "present" if worker_identity else "None"
    logger.debug("[revive_queen] worker_identity=%s", identity_state)

    # Start queen with existing session context
    logger.debug("[revive_queen] Calling _start_queen...")
    await self._start_queen(session, worker_identity=worker_identity)

    logger.info(
        "Queen revived for session '%s', new queen_executor=%s",
        session.id,
        session.queen_executor,
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Lookups
|
|
# ------------------------------------------------------------------
|
|
|
|
def get_session(self, session_id: str) -> Session | None:
|
|
return self._sessions.get(session_id)
|
|
|
|
def get_session_by_graph_id(self, graph_id: str) -> Session | None:
|
|
"""Find a session by its loaded graph's ID."""
|
|
for s in self._sessions.values():
|
|
if s.graph_id == graph_id:
|
|
return s
|
|
return None
|
|
|
|
def get_session_for_agent(self, agent_id: str) -> Session | None:
|
|
"""Resolve an agent_id to a session (backward compat).
|
|
|
|
Checks session.id first, then session.graph_id.
|
|
"""
|
|
s = self._sessions.get(agent_id)
|
|
if s:
|
|
return s
|
|
return self.get_session_by_graph_id(agent_id)
|
|
|
|
def is_loading(self, session_id: str) -> bool:
    """Return True when ``session_id`` is currently in the loading set."""
    pending = self._loading
    return session_id in pending
|
|
|
|
def list_sessions(self) -> list[Session]:
|
|
return list(self._sessions.values())
|
|
|
|
# ------------------------------------------------------------------
|
|
# Cold session helpers (disk-only, no live runtime required)
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
def get_cold_session_info(session_id: str) -> dict | None:
    """Return disk metadata for a session that is no longer live in memory.

    Checks whether queen conversation files exist at
    ~/.hive/agents/queens/{name}/sessions/{session_id}/conversations/. Returns None when
    no data is found so callers can fall through to a 404.

    Args:
        session_id: Session to look up across all queen directories.

    Returns:
        None when the session has no ``conversations`` directory, otherwise a
        dict with ``session_id``, ``cold`` (True), ``live`` (False),
        ``has_messages``, ``created_at``, ``agent_name`` and ``agent_path``.
        ``created_at`` prefers a truthy value from ``meta.json`` and falls
        back to the directory's ``st_ctime`` (0.0 if that cannot be read).
    """
    queen_dir = _find_queen_session_dir(session_id)
    convs_dir = queen_dir / "conversations"
    if not convs_dir.exists():
        return None

    def _has_json_parts(parts_dir: Path) -> bool:
        # True when the directory exists and holds at least one *.json file.
        return parts_dir.exists() and any(f.suffix == ".json" for f in parts_dir.iterdir())

    # Check whether any message part files are actually present
    has_messages = False
    try:
        # Flat layout: conversations/parts/*.json
        if _has_json_parts(convs_dir / "parts"):
            has_messages = True
        else:
            # Node-based layout: conversations/<node_id>/parts/*.json
            has_messages = any(
                _has_json_parts(node_dir / "parts")
                for node_dir in convs_dir.iterdir()
                if node_dir.is_dir() and node_dir.name != "parts"
            )
    except OSError:
        # Unreadable directories count as "no messages found".
        pass

    try:
        created_at = queen_dir.stat().st_ctime
    except OSError:
        created_at = 0.0

    # Read extra metadata written at session start
    agent_name: str | None = None
    agent_path: str | None = None
    meta_path = queen_dir / "meta.json"
    if meta_path.exists():
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            meta = None
        # Valid JSON that is not an object carries no usable metadata.
        if isinstance(meta, dict):
            agent_name = meta.get("agent_name")
            agent_path = meta.get("agent_path")
            created_at = meta.get("created_at") or created_at

    return {
        "session_id": session_id,
        "cold": True,
        "live": False,
        "has_messages": has_messages,
        "created_at": created_at,
        "agent_name": agent_name,
        "agent_path": agent_path,
    }
|
|
|
|
@staticmethod
def list_cold_sessions() -> list[dict]:
    """Return metadata for every queen session directory on disk, newest first.

    Session directories are collected from ``<queen>/sessions/`` under every
    queen directory in ``QUEENS_DIR`` and ordered by directory mtime.
    Malformed or unreadable files are skipped rather than failing the listing.

    Returns:
        One dict per session directory with ``session_id``, ``cold`` (True),
        ``live`` (False), ``has_messages``, ``created_at``, ``agent_name``,
        ``agent_path``, ``last_message`` (first 120 characters of the latest
        assistant text, or None), ``message_count`` and ``queen_id``.
        Returns an empty list when ``QUEENS_DIR`` is missing or unreadable.
    """
    if not QUEENS_DIR.exists():
        return []

    def _safe_mtime(path: Path) -> float:
        # A directory may vanish between listing and sorting; sort it last.
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    def _read_meta(meta_path: Path) -> dict:
        # Return the parsed meta.json object, or {} when absent/unusable.
        if not meta_path.exists():
            return {}
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            return {}
        return meta if isinstance(meta, dict) else {}

    def _load_parts(parts_dir: Path) -> list[dict]:
        # Load every *.json part (in filename order) that parses to an object.
        loaded: list[dict] = []
        if not parts_dir.exists():
            return loaded
        for part_file in sorted(parts_dir.iterdir()):
            if part_file.suffix != ".json":
                continue
            try:
                part = json.loads(part_file.read_text(encoding="utf-8"))
                if not isinstance(part, dict):
                    continue
                # Parts without a timestamp fall back to the file's mtime.
                part.setdefault("created_at", part_file.stat().st_mtime)
            except (ValueError, OSError):
                continue
            loaded.append(part)
        return loaded

    def _text_of(content: Any) -> str:
        # Flatten message content to plain text; unknown shapes yield "".
        if isinstance(content, list):
            # Anthropic-style content blocks
            texts = [
                b.get("text", "")
                for b in content
                if isinstance(b, dict) and b.get("type") == "text"
            ]
            return " ".join(t if isinstance(t, str) else "" for t in texts)
        return content if isinstance(content, str) else ""

    # Collect session dirs from all queen identities
    all_session_dirs: list[Path] = []
    try:
        for queen_dir in QUEENS_DIR.iterdir():
            if not queen_dir.is_dir():
                continue
            sessions_dir = queen_dir / "sessions"
            if sessions_dir.exists():
                all_session_dirs.extend(d for d in sessions_dir.iterdir() if d.is_dir())
    except OSError:
        return []

    # Sort all sessions by mtime, newest first
    all_session_dirs.sort(key=_safe_mtime, reverse=True)

    results: list[dict] = []
    for d in all_session_dirs:
        if not d.is_dir():
            continue
        try:
            created_at = d.stat().st_ctime
        except OSError:
            created_at = 0.0

        meta = _read_meta(d / "meta.json")
        agent_name: str | None = meta.get("agent_name")
        agent_path: str | None = meta.get("agent_path")
        created_at = meta.get("created_at") or created_at

        # Build a quick preview of the last human/assistant exchange.
        # We read all conversation parts, filter to client-facing messages,
        # and return the last assistant message content as a snippet.
        last_message: str | None = None
        message_count: int = 0
        convs_dir = d / "conversations"
        if convs_dir.exists():
            try:
                # Flat layout: conversations/parts/*.json
                all_parts = _load_parts(convs_dir / "parts")
                # Node-based layout: conversations/<node_id>/parts/*.json
                for node_dir in convs_dir.iterdir():
                    if not node_dir.is_dir() or node_dir.name == "parts":
                        continue
                    all_parts.extend(_load_parts(node_dir / "parts"))
                # Filter to client-facing messages only
                client_msgs = [
                    p
                    for p in all_parts
                    if not p.get("is_transition_marker")
                    and p.get("role") != "tool"
                    and not (p.get("role") == "assistant" and p.get("tool_calls"))
                ]
                try:
                    client_msgs = sorted(
                        client_msgs,
                        key=lambda m: m.get("created_at", m.get("seq", 0)),
                    )
                except TypeError:
                    # Unorderable timestamps (mixed types): keep load order.
                    pass
                message_count = len(client_msgs)
                # Last assistant message as preview snippet
                for msg in reversed(client_msgs):
                    content = _text_of(msg.get("content"))
                    if content and msg.get("role") == "assistant":
                        last_message = content[:120].strip()
                        break
            except OSError:
                pass

        # Derive queen_id from directory structure: queens/{queen_id}/sessions/{session_id}
        queen_id = d.parent.parent.name if d.parent.name == "sessions" else None

        results.append(
            {
                "session_id": d.name,
                "cold": True,  # caller overrides for live sessions
                "live": False,
                "has_messages": convs_dir.exists() and message_count > 0,
                "created_at": created_at,
                "agent_name": agent_name,
                "agent_path": agent_path,
                "last_message": last_message,
                "message_count": message_count,
                "queen_id": queen_id,
            }
        )

    return results
|
|
|
|
async def shutdown_all(self) -> None:
    """Gracefully stop all sessions. Called on server shutdown.

    Every session gets a stop attempt: a failure while stopping one session
    is logged with its traceback and does not prevent the remaining sessions
    from being stopped.
    """
    failed = 0
    # Iterate over a snapshot of the ids, not the live mapping.
    for sid in list(self._sessions.keys()):
        try:
            await self.stop_session(sid)
        except Exception:
            failed += 1
            logger.exception("Failed to stop session '%s' during shutdown", sid)
    if failed:
        logger.warning("Shutdown finished; %d session(s) failed to stop cleanly", failed)
    else:
        logger.info("All sessions stopped")
|