Merge branch 'feature/colony-sqlite' into feature/clean-context

This commit is contained in:
Timothy
2026-04-17 04:12:35 -07:00
44 changed files with 3414 additions and 492 deletions
+11
View File
@@ -0,0 +1,11 @@
"""Append one handled reply to the X rapid-post ledger."""
import json

LEDGER = '/home/timothy/aden/hive/x_rapid_ledger.json'

with open(LEDGER, 'r') as fh:
    ledger = json.load(fh)

# Record the tweet preview so the duplicate-check script can recognise
# this tweet as already answered.
ledger['replies'].append({
    'original_preview': 'Alright, I give in. Heres my picture with the boss, courtesy of @johnkrausphotos. Oh, and hook em!'
})

with open(LEDGER, 'w') as fh:
    json.dump(ledger, fh, indent=2)
+11
View File
@@ -0,0 +1,11 @@
"""Print YES if the given tweet preview is already in the ledger, else NO.

Usage: script.py "<original_preview text>"
"""
import json, sys

with open('/home/timothy/aden/hive/x_rapid_ledger.json', 'r') as fh:
    ledger = json.load(fh)

needle = sys.argv[1]
found = any(
    entry.get('original_preview') == needle for entry in ledger['replies']
)
print("YES" if found else "NO")
+10 -2
View File
@@ -184,8 +184,16 @@ _QUEEN_INDEPENDENT_TOOLS = [
"search_files",
"run_command",
"undo_changes",
# Parallel fan-out (Phase 4 unified ColonyRuntime)
"run_parallel_workers",
# NOTE (2026-04-16): ``run_parallel_workers`` was removed from the
# independent phase. The queen's pure DM mode is for conversation
# with the user; spawning workers from here puts their activity
# into a chat surface that's supposed to stay queen↔user only.
# Users who want to fan out parallel work should (a) use
# ``create_colony`` to fork into a persistent colony (where
# worker activity has its own page), or (b) load an agent via
# build/stage and use ``run_parallel_workers`` in the running
# phase where a worker context already exists.
#
# Fork this session into a persistent colony for headless /
# recurring / background work that needs to keep running in
# parallel to (or after) this chat.
+48 -2
View File
@@ -631,6 +631,43 @@ class ColonyRuntime:
spawn_tools = tools if tools is not None else self._tools
spawn_executor = tool_executor or self._tool_executor
# Colony progress tracker: when the caller supplied a db_path
# in input_data, this worker is part of a SQLite task queue
# and must see the hive.colony-progress-tracker skill body in
# its system prompt from turn 0. Rebuild the catalog with the
# skill pre-activated; falls back to the colony default when
# no db_path is present.
_spawn_catalog = self.skills_catalog_prompt
_spawn_skill_dirs = self.skill_dirs
if isinstance(input_data, dict) and input_data.get("db_path"):
try:
from framework.skills.config import SkillsConfig
from framework.skills.manager import SkillsManager, SkillsManagerConfig
_pre = SkillsManager(
SkillsManagerConfig(
skills_config=SkillsConfig.from_agent_vars(
skills=["hive.colony-progress-tracker"],
),
)
)
_pre.load()
_spawn_catalog = _pre.skills_catalog_prompt
_spawn_skill_dirs = list(_pre.allowlisted_dirs) if hasattr(_pre, "allowlisted_dirs") else self.skill_dirs
logger.info(
"spawn: pre-activated hive.colony-progress-tracker "
"(catalog %d→%d chars) for worker with db_path=%s",
len(self.skills_catalog_prompt),
len(_spawn_catalog),
input_data.get("db_path"),
)
except Exception as exc:
logger.warning(
"spawn: failed to pre-activate colony-progress-tracker "
"skill, falling back to base catalog: %s",
exc,
)
# Resolve the SSE stream_id once. When the caller didn't supply
# one we use the per-worker fan-out tag (filtered out by the
# SSE handler). When the caller passed an explicit value we
@@ -685,9 +722,9 @@ class ColonyRuntime:
llm=self._llm,
available_tools=list(spawn_tools),
accounts_prompt=self._accounts_prompt,
skills_catalog_prompt=self.skills_catalog_prompt,
skills_catalog_prompt=_spawn_catalog,
protocols_prompt=self.protocols_prompt,
skill_dirs=self.skill_dirs,
skill_dirs=_spawn_skill_dirs,
execution_id=worker_id,
stream_id=explicit_stream_id or f"worker:{worker_id}",
)
@@ -720,6 +757,8 @@ class ColonyRuntime:
async def spawn_batch(
self,
tasks: list[dict[str, Any]],
*,
tools_override: list[Any] | None = None,
) -> list[str]:
"""Spawn a batch of parallel workers, one per task spec.
@@ -732,6 +771,12 @@ class ColonyRuntime:
The overseer's ``run_parallel_workers`` tool is the usual
caller; it pairs ``spawn_batch`` + ``wait_for_worker_reports``
into a single fan-out/fan-in primitive.
When ``tools_override`` is supplied, every spawned worker
receives that tool list instead of the colony's default. Used
by ``run_parallel_workers`` to drop tools whose credentials
failed the pre-flight check (so the spawned workers don't
waste a startup trying to use them).
"""
worker_ids: list[str] = []
for spec in tasks:
@@ -743,6 +788,7 @@ class ColonyRuntime:
task=task_text,
count=1,
input_data=task_data or {"task": task_text},
tools=tools_override,
)
worker_ids.extend(ids)
return worker_ids
+491
View File
@@ -0,0 +1,491 @@
"""Per-colony SQLite task queue + progress ledger.
Every colony gets its own ``progress.db`` under ``~/.hive/colonies/{name}/data/``.
The DB holds the colony's task queue plus per-task step and SOP checklist
rows. Workers claim tasks atomically, write progress as they execute, and
verify SOP gates before marking a task done. This gives cross-run memory
that the existing per-iteration stall detectors don't have.
The DB is driven by agents via the ``sqlite3`` CLI through
``execute_command_tool``. This module handles framework-side lifecycle:
creation, migration, queen-side bulk seeding, stale-claim reclamation.
Concurrency model:
- WAL mode on from day one so 100 concurrent workers don't serialize.
- Workers hold NO long-running connection — they ``sqlite3`` per call,
which naturally releases locks between LLM turns.
- Atomic claim via ``BEGIN IMMEDIATE; UPDATE tasks SET status='claimed'
WHERE id=(SELECT ... LIMIT 1)``. The subquery-form UPDATE runs inside
the immediate transaction so racers either win the row or find zero
affected rows.
- Stale-claim reclaimer runs on host startup: claims older than
``stale_after_minutes`` get returned to ``pending`` and the row's
``retry_count`` increments. When ``retry_count >= max_retries`` the
row is moved to ``failed`` instead.
All writes go through ``BEGIN IMMEDIATE`` so racing readers see
consistent snapshots.
"""
from __future__ import annotations
import json
import logging
import sqlite3
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
SCHEMA_VERSION = 1
_SCHEMA_V1 = """
CREATE TABLE IF NOT EXISTS tasks (
id TEXT PRIMARY KEY,
seq INTEGER,
priority INTEGER NOT NULL DEFAULT 0,
goal TEXT NOT NULL,
payload TEXT,
status TEXT NOT NULL DEFAULT 'pending',
worker_id TEXT,
claim_token TEXT,
claimed_at TEXT,
started_at TEXT,
completed_at TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
retry_count INTEGER NOT NULL DEFAULT 0,
max_retries INTEGER NOT NULL DEFAULT 3,
last_error TEXT,
parent_task_id TEXT REFERENCES tasks(id) ON DELETE SET NULL,
source TEXT
);
CREATE TABLE IF NOT EXISTS steps (
id TEXT PRIMARY KEY,
task_id TEXT NOT NULL REFERENCES tasks(id) ON DELETE CASCADE,
seq INTEGER NOT NULL,
title TEXT NOT NULL,
detail TEXT,
status TEXT NOT NULL DEFAULT 'pending',
evidence TEXT,
worker_id TEXT,
started_at TEXT,
completed_at TEXT,
UNIQUE (task_id, seq)
);
CREATE TABLE IF NOT EXISTS sop_checklist (
id TEXT PRIMARY KEY,
task_id TEXT NOT NULL REFERENCES tasks(id) ON DELETE CASCADE,
key TEXT NOT NULL,
description TEXT NOT NULL,
required INTEGER NOT NULL DEFAULT 1,
done_at TEXT,
done_by TEXT,
note TEXT,
UNIQUE (task_id, key)
);
CREATE TABLE IF NOT EXISTS colony_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_tasks_claimable
ON tasks(status, priority DESC, seq, created_at)
WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_steps_task_seq
ON steps(task_id, seq);
CREATE INDEX IF NOT EXISTS idx_sop_required_open
ON sop_checklist(task_id, required, done_at);
CREATE INDEX IF NOT EXISTS idx_tasks_status
ON tasks(status, updated_at);
"""
_PRAGMAS = (
"PRAGMA journal_mode = WAL;",
"PRAGMA synchronous = NORMAL;",
"PRAGMA foreign_keys = ON;",
"PRAGMA busy_timeout = 5000;",
)
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat(timespec="seconds")
def _new_id() -> str:
return str(uuid.uuid4())
def _connect(db_path: Path) -> sqlite3.Connection:
"""Open a connection with the standard pragmas applied.
WAL mode is sticky on the file once set, so re-applying on every
open is cheap. The other pragmas are per-connection and must be
set each time.
"""
con = sqlite3.connect(str(db_path), isolation_level=None, timeout=5.0)
for pragma in _PRAGMAS:
con.execute(pragma)
return con
def ensure_progress_db(colony_dir: Path) -> Path:
    """Create or migrate ``{colony_dir}/data/progress.db``.

    Safe to call repeatedly on an already-initialized colony. Returns
    the absolute path of the DB file.

    What happens on each call:
      1. The ``data/`` subdirectory is created if missing.
      2. The DB is opened (creating the file on first use) with WAL +
         standard pragmas via ``_connect``.
      3. ``PRAGMA user_version`` is compared to ``SCHEMA_VERSION``; an
         out-of-date DB gets the (idempotent) schema script, a version
         bump, and a ``colony_meta`` bookkeeping row.
      4. Stale worker claims left by previous runs are reclaimed.
      5. Worker ``*.json`` configs in the colony directory are patched
         with ``input_data.db_path`` / ``input_data.colony_id`` so
         colonies forked before this feature landed get the tracker
         wiring on their next spawn.
    """
    colony_root = Path(colony_dir)
    data_dir = colony_root / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    db_path = data_dir / "progress.db"
    con = _connect(db_path)
    try:
        (version,) = con.execute("PRAGMA user_version").fetchone()
        if version < SCHEMA_VERSION:
            # All DDL is IF NOT EXISTS, so a partially created DB
            # converges to the full schema here.
            con.executescript(_SCHEMA_V1)
            con.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
            con.execute(
                "INSERT OR REPLACE INTO colony_meta(key, value, updated_at) "
                "VALUES (?, ?, ?)",
                ("schema_version", str(SCHEMA_VERSION), _now_iso()),
            )
            logger.info(
                "progress_db: initialized schema v%d at %s", SCHEMA_VERSION, db_path
            )
        stale_count = _reclaim_stale_inner(con, stale_after_minutes=15)
        if stale_count:
            logger.info(
                "progress_db: reclaimed %d stale claims at startup (%s)",
                stale_count,
                db_path,
            )
    finally:
        con.close()
    resolved = db_path.resolve()
    _patch_worker_configs(colony_root, resolved)
    return resolved
def _patch_worker_configs(colony_dir: Path, db_path: Path) -> int:
"""Inject ``input_data.db_path`` + ``input_data.colony_id`` into
existing ``worker.json`` files in a colony directory.
Runs on every ``ensure_progress_db`` call so colonies that were
forked before this feature landed get their worker spawn messages
patched in place. Idempotent: if ``input_data`` already contains
the correct ``db_path``, the file is not rewritten.
Returns the number of files that were actually modified (0 on
the common case of already-patched colonies).
"""
colony_id = colony_dir.name
abs_db = str(db_path)
patched = 0
for worker_cfg in colony_dir.glob("*.json"):
# Only patch files that look like worker configs (have the
# worker_meta shape). ``metadata.json`` and ``triggers.json``
# are colony-level and must not be touched.
if worker_cfg.name in ("metadata.json", "triggers.json"):
continue
try:
data = json.loads(worker_cfg.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
continue
if not isinstance(data, dict) or "system_prompt" not in data:
# Not a worker config (lacks the worker_meta schema).
continue
input_data = data.get("input_data")
if not isinstance(input_data, dict):
input_data = {}
if (
input_data.get("db_path") == abs_db
and input_data.get("colony_id") == colony_id
):
continue # already patched
input_data["db_path"] = abs_db
input_data["colony_id"] = colony_id
data["input_data"] = input_data
try:
worker_cfg.write_text(
json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
)
patched += 1
except OSError as e:
logger.warning(
"progress_db: failed to patch worker config %s: %s", worker_cfg, e
)
if patched:
logger.info(
"progress_db: patched %d worker config(s) in colony '%s' with db_path",
patched,
colony_id,
)
return patched
def ensure_all_colony_dbs(colonies_root: Path | None = None) -> list[Path]:
    """Ensure a ``progress.db`` exists for every colony under *colonies_root*.

    Runs at framework host startup: backfills DBs for older colonies
    and triggers the stale-claim reclaimer on each in a single pass.
    Defaults to ``~/.hive/colonies`` when no root is given; a missing
    root yields an empty list. Per-colony failures are logged and do
    not stop the sweep.
    """
    root = (
        Path.home() / ".hive" / "colonies"
        if colonies_root is None
        else colonies_root
    )
    if not root.is_dir():
        return []
    ensured: list[Path] = []
    for colony_dir in sorted(root.iterdir()):
        if not colony_dir.is_dir():
            continue
        try:
            ensured.append(ensure_progress_db(colony_dir))
        except Exception as e:
            logger.warning(
                "progress_db: failed to ensure DB for colony '%s': %s",
                colony_dir.name,
                e,
            )
    return ensured
def seed_tasks(
    db_path: Path,
    tasks: list[dict[str, Any]],
    *,
    source: str = "queen_create",
) -> list[str]:
    """Bulk-insert tasks (with optional nested steps + sop_items).

    Each task dict accepts:
    - goal: str (required)
    - seq: int (optional ordering hint)
    - priority: int (default 0)
    - payload: dict | str | None (stored as JSON text)
    - max_retries: int (default 3)
    - parent_task_id: str | None
    - steps: list[{"title": str, "detail"?: str}] (optional)
    - sop_items: list[{"key": str, "description": str, "required"?: bool, "note"?: str}] (optional)

    All rows are inserted in a single BEGIN IMMEDIATE transaction so
    10k-row seeds finish in one disk flush. Returns the created task ids
    in the same order as input. Raises ValueError on a malformed task
    dict, in which case the entire batch is rolled back (all-or-nothing).
    """
    # Fast path: don't open a connection for an empty seed.
    if not tasks:
        return []
    created_ids: list[str] = []
    # One timestamp for the whole batch so created_at/updated_at agree
    # across every row of a single seed call.
    now = _now_iso()
    con = _connect(Path(db_path))
    try:
        # The connection is in autocommit mode (isolation_level=None),
        # so this explicit BEGIN IMMEDIATE both groups the batch into
        # one transaction and takes the write lock up front.
        con.execute("BEGIN IMMEDIATE")
        for idx, task in enumerate(tasks):
            goal = task.get("goal")
            if not goal:
                # Aborts the whole batch — rolled back in the except
                # handler below.
                raise ValueError(f"task[{idx}] missing required 'goal' field")
            # A caller-supplied id enables idempotent re-seeds;
            # otherwise each row gets a fresh UUID.
            task_id = task.get("id") or _new_id()
            payload = task.get("payload")
            if payload is not None and not isinstance(payload, str):
                # Dict/list payloads are serialized to JSON text.
                payload = json.dumps(payload, ensure_ascii=False)
            con.execute(
                """
                INSERT INTO tasks (
                    id, seq, priority, goal, payload, status,
                    created_at, updated_at, max_retries, parent_task_id, source
                ) VALUES (?, ?, ?, ?, ?, 'pending', ?, ?, ?, ?, ?)
                """,
                (
                    task_id,
                    task.get("seq"),
                    int(task.get("priority", 0)),
                    goal,
                    payload,
                    now,
                    now,
                    int(task.get("max_retries", 3)),
                    task.get("parent_task_id"),
                    source,
                ),
            )
            # Nested steps: seq defaults to 1-based position unless the
            # step dict provides its own (UNIQUE(task_id, seq) applies).
            for step_seq, step in enumerate(task.get("steps") or [], start=1):
                if not step.get("title"):
                    raise ValueError(
                        f"task[{idx}].steps[{step_seq - 1}] missing required 'title'"
                    )
                con.execute(
                    """
                    INSERT INTO steps (id, task_id, seq, title, detail, status)
                    VALUES (?, ?, ?, ?, ?, 'pending')
                    """,
                    (
                        _new_id(),
                        task_id,
                        step.get("seq", step_seq),
                        step["title"],
                        step.get("detail"),
                    ),
                )
            # SOP checklist rows; ``required`` defaults to True and is
            # stored as 1/0 for SQLite.
            for sop in task.get("sop_items") or []:
                key = sop.get("key")
                description = sop.get("description")
                if not key or not description:
                    raise ValueError(
                        f"task[{idx}].sop_items missing 'key' or 'description'"
                    )
                con.execute(
                    """
                    INSERT INTO sop_checklist
                        (id, task_id, key, description, required, note)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        _new_id(),
                        task_id,
                        key,
                        description,
                        1 if sop.get("required", True) else 0,
                        sop.get("note"),
                    ),
                )
            created_ids.append(task_id)
        con.execute("COMMIT")
    except Exception:
        # Any failure undoes the whole batch (all-or-nothing seeding).
        con.execute("ROLLBACK")
        raise
    finally:
        con.close()
    return created_ids
def enqueue_task(
    db_path: Path,
    goal: str,
    *,
    steps: list[dict[str, Any]] | None = None,
    sop_items: list[dict[str, Any]] | None = None,
    payload: Any = None,
    priority: int = 0,
    parent_task_id: str | None = None,
    source: str = "enqueue_tool",
) -> str:
    """Append a single task to an existing queue.

    Thin convenience wrapper over ``seed_tasks``: builds a one-element
    batch and returns the id of the created row. See ``seed_tasks`` for
    field semantics.
    """
    spec = {
        "goal": goal,
        "steps": steps,
        "sop_items": sop_items,
        "payload": payload,
        "priority": priority,
        "parent_task_id": parent_task_id,
    }
    created = seed_tasks(db_path, [spec], source=source)
    return created[0]
def _reclaim_stale_inner(
con: sqlite3.Connection, *, stale_after_minutes: int
) -> int:
"""Reclaim stale claims. Runs inside an existing open connection.
Two-step:
1. Tasks past max_retries go to 'failed' with last_error populated.
2. Remaining stale claims return to 'pending', retry_count++.
"""
cutoff_expr = f"datetime('now', '-{int(stale_after_minutes)} minutes')"
con.execute("BEGIN IMMEDIATE")
try:
con.execute(
f"""
UPDATE tasks
SET status = 'failed',
last_error = COALESCE(last_error, 'exceeded max_retries after stale claim'),
completed_at = datetime('now'),
updated_at = datetime('now')
WHERE status IN ('claimed', 'in_progress')
AND claimed_at IS NOT NULL
AND claimed_at < {cutoff_expr}
AND retry_count >= max_retries
"""
)
cur = con.execute(
f"""
UPDATE tasks
SET status = 'pending',
worker_id = NULL,
claim_token = NULL,
claimed_at = NULL,
started_at = NULL,
retry_count = retry_count + 1,
updated_at = datetime('now')
WHERE status IN ('claimed', 'in_progress')
AND claimed_at IS NOT NULL
AND claimed_at < {cutoff_expr}
AND retry_count < max_retries
"""
)
reclaimed = cur.rowcount or 0
con.execute("COMMIT")
return reclaimed
except Exception:
con.execute("ROLLBACK")
raise
def reclaim_stale(db_path: Path, stale_after_minutes: int = 15) -> int:
    """Open a connection, reclaim stale claims, and close it again.

    Public wrapper around ``_reclaim_stale_inner`` for callers that do
    not already hold a connection. Returns the number of tasks returned
    to ``'pending'``.
    """
    connection = _connect(Path(db_path))
    try:
        return _reclaim_stale_inner(
            connection, stale_after_minutes=stale_after_minutes
        )
    finally:
        connection.close()
# Public API: framework-side lifecycle helpers only. Agents drive the
# DB itself through the sqlite3 CLI, so row-level operations are not
# exported here.
__all__ = [
    "SCHEMA_VERSION",
    "ensure_progress_db",
    "ensure_all_colony_dbs",
    "seed_tasks",
    "enqueue_task",
    "reclaim_stale",
]
+12 -1
View File
@@ -1404,7 +1404,18 @@ class AgentLoader:
credential_store=credential_store,
)
runner._agent_default_skills = None
runner._agent_skills = None
# Colony workers attached to a SQLite task queue get the
# colony-progress-tracker skill pre-activated so its full
# claim / step / SOP-gate protocol lands in the system prompt
# on turn 0, bypassing the progressive-disclosure catalog
# lookup. Triggered by the presence of ``input_data.db_path``
# in worker.json (written by fork_session_into_colony and
# backfilled by ensure_progress_db for pre-existing colonies).
_preactivate: list[str] = []
_input_data = first_worker.get("input_data") or {}
if isinstance(_input_data, dict) and _input_data.get("db_path"):
_preactivate.append("hive.colony-progress-tracker")
runner._agent_skills = _preactivate or None
return runner
def register_tool(
+11 -1
View File
@@ -497,12 +497,22 @@ class ToolRegistry:
config["cwd"] = str(resolved_cwd)
return config
# For coder_tools_server, inject --project-root so writes go to the expected workspace
# For coder_tools_server, inject --project-root so reads land
# in the expected workspace (hive repo, for framework skills
# and docs), and inject --write-root so writes land under
# ~/.hive/workspace/ instead of polluting the git checkout
# with queen-authored skills, ledgers, and scripts. Without
# the split, every ``write_file`` call from the queen landed
# in the hive repo root.
if script_name and "coder_tools" in script_name:
project_root = str(resolved_cwd.parent.resolve())
args = list(args)
if "--project-root" not in args:
args.extend(["--project-root", project_root])
if "--write-root" not in args:
_write_root = Path.home() / ".hive" / "workspace"
_write_root.mkdir(parents=True, exist_ok=True)
args.extend(["--write-root", str(_write_root)])
config["args"] = args
if os.name == "nt":
+22 -9
View File
@@ -51,13 +51,18 @@ DEFAULT_EVENT_TYPES = [
# Keepalive interval in seconds
KEEPALIVE_INTERVAL = 15.0
# Phase 5 SSE filter: parallel-worker streams (stream_id="worker:{uuid}")
# publish high-frequency LLM deltas / tool calls that would flood the
# user's queen DM chat. We let only this small allowlist of worker
# events through to the queen-chat SSE so the frontend can render
# fan-out lifecycle and structured fan-in reports without seeing the
# raw worker chatter. Per-worker SSE panels (Phase 5b) bypass this
# filter via a dedicated /workers/{worker_id}/events route.
# Session-SSE worker filter: workers run outside the queen's DM
# chat. Worker activity is observable via the dedicated
# ``/api/workers/{worker_id}/events`` per-worker SSE route, not via
# the session chat. This keeps the queen↔user conversation clean of
# tool-call chatter regardless of whether the worker was spawned by
# ``run_agent_with_input`` (stream_id="worker") or
# ``run_parallel_workers`` (stream_id="worker:{uuid}").
#
# Lifecycle events the frontend needs for fan-in summaries
# (SUBAGENT_REPORT, EXECUTION_COMPLETED, EXECUTION_FAILED) are still
# allowed through so the queen can show "N workers done" surfaces
# without exposing the per-turn chatter.
_WORKER_EVENT_ALLOWLIST = {
EventType.SUBAGENT_REPORT.value,
EventType.EXECUTION_COMPLETED.value,
@@ -66,9 +71,17 @@ _WORKER_EVENT_ALLOWLIST = {
def _is_worker_noise(evt_dict: dict) -> bool:
"""True if the event is a parallel-worker event we should drop."""
"""True if the event belongs to a worker stream and should not
surface in the queen DM chat.
Matches any stream starting with ``worker`` both the bare
``"worker"`` tag used by single-worker spawns and the
``"worker:{uuid}"`` tag used by parallel fan-outs. The allowlist
carves out the three terminal/lifecycle events the UI still
needs to render fan-in summaries.
"""
stream_id = evt_dict.get("stream_id") or ""
if not stream_id.startswith("worker:"):
if not stream_id.startswith("worker"):
return False
return evt_dict.get("type") not in _WORKER_EVENT_ALLOWLIST
+73 -2
View File
@@ -644,6 +644,7 @@ async def handle_colony_spawn(request: web.Request) -> web.Response:
body = await request.json()
colony_name = body.get("colony_name", "").strip()
task = body.get("task", "").strip()
tasks = body.get("tasks")
if not colony_name:
return web.json_response({"error": "colony_name is required"}, status=400)
@@ -661,6 +662,7 @@ async def handle_colony_spawn(request: web.Request) -> web.Response:
session=session,
colony_name=colony_name,
task=task,
tasks=tasks if isinstance(tasks, list) else None,
)
except Exception as e:
logger.exception("colony_spawn fork failed")
@@ -674,6 +676,7 @@ async def fork_session_into_colony(
session: Any,
colony_name: str,
task: str,
tasks: list[dict] | None = None,
) -> dict:
"""Fork a queen session into a colony directory.
@@ -690,8 +693,14 @@ async def fork_session_into_colony(
the colony resumes with the queen's entire conversation history.
3. Multiple independent sessions can be created against the same colony,
giving parallel execution capacity without separate worker configs.
4. Initializes (or ensures) ``data/progress.db`` — the colony's SQLite
task queue + progress ledger. When *tasks* is provided, the queen-
authored task batch is seeded into the queue in one transaction.
The absolute DB path is threaded into the worker's ``input_data``
so spawned workers see it in their first user message.
Returns ``{"colony_path", "colony_name", "queen_session_id", "is_new"}``.
Returns ``{"colony_path", "colony_name", "queen_session_id", "is_new",
"db_path", "task_ids"}``.
"""
import asyncio
import json
@@ -700,7 +709,8 @@ async def fork_session_into_colony(
from pathlib import Path
from framework.agent_loop.agent_loop import AgentLoop, LoopConfig
from framework.agent_loop.types import AgentContext
from framework.agent_loop.types import AgentContext, AgentSpec
from framework.host.progress_db import ensure_progress_db, seed_tasks
from framework.server.session_manager import _queen_session_dir
queen_loop: AgentLoop = session.queen_executor.node_registry["queen"]
@@ -711,6 +721,49 @@ async def fork_session_into_colony(
colony_dir.mkdir(parents=True, exist_ok=True)
(colony_dir / "data").mkdir(exist_ok=True)
# ── 0. Ensure the colony's progress DB exists and seed tasks ──
# Runs before worker.json is written so the DB path can be threaded
# into input_data. Idempotent on reruns of the same colony name.
db_path = await asyncio.to_thread(ensure_progress_db, colony_dir)
seeded_task_ids: list[str] = []
if tasks:
seeded_task_ids = await asyncio.to_thread(
seed_tasks, db_path, tasks, source="queen_create"
)
logger.info(
"progress_db: seeded %d task(s) into colony '%s'",
len(seeded_task_ids),
colony_name,
)
elif task and task.strip():
# Phase 2 auto-seed: when the queen uses the simple single-task
# form of create_colony (no explicit ``tasks=[{...}]`` list),
# insert exactly one row so the first worker spawned into this
# colony has something to claim. Without this the queue is
# empty and the worker falls back to executing from the chat
# spawn message, defeating the cross-run durability the tracker
# exists for.
try:
seeded_task_ids = await asyncio.to_thread(
seed_tasks,
db_path,
[{"goal": task.strip()}],
source="create_colony_auto",
)
logger.info(
"progress_db: auto-seeded 1 task into colony '%s' "
"(task_id=%s, from single-task create_colony form)",
colony_name,
seeded_task_ids[0] if seeded_task_ids else "?",
)
except Exception as exc:
logger.warning(
"progress_db: auto-seed failed for colony '%s' (continuing "
"without a pre-seeded row): %s",
colony_name,
exc,
)
# Fixed worker name -- sessions are the unit of parallelism, not workers
worker_name = "worker"
@@ -772,10 +825,26 @@ async def fork_session_into_colony(
# worker is not Charlotte / Alexandra / etc., it is a task executor.
# Inheriting the queen's persona made the worker greet the user in
# first person with no memory of the task it was actually given.
# Thread the first seeded task_id into input_data so the worker's
# first claim pins to a specific row (skill's assigned-task-id
# branch). When multiple tasks were seeded we only pin the first —
# subsequent workers (via run_agent_with_input or parallel spawns)
# get their own task_id assigned at spawn time.
_worker_input_data: dict[str, Any] = {
"db_path": str(db_path),
"colony_id": colony_name,
}
if seeded_task_ids:
_worker_input_data["task_id"] = seeded_task_ids[0]
worker_meta = {
"name": worker_name,
"version": "1.0.0",
"description": f"Worker clone from queen session {session.id}",
# Colony progress tracker: worker sees these in its first user
# message via _format_spawn_task_message. The colony-progress-
# tracker default skill teaches the worker how to use them.
"input_data": _worker_input_data,
"goal": {
"description": worker_task,
"success_criteria": [],
@@ -907,6 +976,8 @@ async def fork_session_into_colony(
"colony_name": colony_name,
"queen_session_id": colony_session_id,
"is_new": is_new,
"db_path": str(db_path),
"task_ids": seeded_task_ids,
}
+71 -5
View File
@@ -686,6 +686,10 @@ async def handle_session_colonies(request: web.Request) -> web.Response:
return web.json_response({"colonies": colonies})
_EVENTS_HISTORY_DEFAULT_LIMIT = 2000
_EVENTS_HISTORY_MAX_LIMIT = 10000
async def handle_session_events_history(request: web.Request) -> web.Response:
"""GET /api/sessions/{session_id}/events/history — persisted eventbus log.
@@ -693,17 +697,58 @@ async def handle_session_events_history(request: web.Request) -> web.Response:
both live sessions and cold (post-server-restart) sessions. The frontend
replays these events through ``sseEventToChatMessage`` to fully reconstruct
the UI state on resume.
Query params:
limit: maximum number of events to return (default 2000, max 10000).
The TAIL of the file is returned — i.e. the most recent N events.
Older events are dropped and ``truncated`` is set to True.
Response shape::
{
"events": [...], # up to ``limit`` events, oldest-first
"session_id": "...",
"total": 12345, # total events in the file
"returned": 2000, # len(events)
"truncated": true, # total > returned
"limit": 2000, # the effective limit used
}
``events.jsonl`` is append-only chronological, so "last N lines" == "most
recent N events". Long-running colonies have produced files with 50k+
events; before this cap, restoring on page-mount shipped the whole thing
down the wire and blocked the UI for seconds.
"""
session_id = request.match_info["session_id"]
try:
limit = int(request.query.get("limit", str(_EVENTS_HISTORY_DEFAULT_LIMIT)))
except ValueError:
limit = _EVENTS_HISTORY_DEFAULT_LIMIT
limit = max(1, min(limit, _EVENTS_HISTORY_MAX_LIMIT))
from framework.server.session_manager import _find_queen_session_dir
queen_dir = _find_queen_session_dir(session_id)
events_path = queen_dir / "events.jsonl"
if not events_path.exists():
return web.json_response({"events": [], "session_id": session_id})
return web.json_response(
{
"events": [],
"session_id": session_id,
"total": 0,
"returned": 0,
"truncated": False,
"limit": limit,
}
)
events: list[dict] = []
# Tail the file using a bounded deque — O(limit) memory regardless
# of file size. No need to materialize the whole list only to slice it.
from collections import deque
tail: deque[dict] = deque(maxlen=limit)
total = 0
try:
with open(events_path, encoding="utf-8") as f:
for line in f:
@@ -711,13 +756,34 @@ async def handle_session_events_history(request: web.Request) -> web.Response:
if not line:
continue
try:
events.append(json.loads(line))
evt = json.loads(line)
except json.JSONDecodeError:
continue
total += 1
tail.append(evt)
except OSError:
return web.json_response({"events": [], "session_id": session_id})
return web.json_response(
{
"events": [],
"session_id": session_id,
"total": 0,
"returned": 0,
"truncated": False,
"limit": limit,
}
)
return web.json_response({"events": events, "session_id": session_id})
events = list(tail)
return web.json_response(
{
"events": events,
"session_id": session_id,
"total": total,
"returned": len(events),
"truncated": total > len(events),
"limit": limit,
}
)
async def handle_session_history(request: web.Request) -> web.Response:
+18
View File
@@ -139,6 +139,24 @@ class SessionManager:
except Exception:
logger.warning("v2 migration failed (non-fatal)", exc_info=True)
# Ensure every existing colony has an up-to-date progress.db
# (schema v1, WAL mode) and reclaim any stale claims left behind
# by crashed workers from the previous run. Idempotent and
# fast; runs synchronously because the event loop hasn't
# started yet at __init__ time.
from framework.host.progress_db import ensure_all_colony_dbs
try:
ensured = ensure_all_colony_dbs()
if ensured:
logger.info(
"progress_db: ensured %d colony DB(s) at startup", len(ensured)
)
except Exception:
logger.warning(
"progress_db: backfill at startup failed (non-fatal)", exc_info=True
)
def build_llm(self, model: str | None = None):
"""Construct an LLM provider using the server's configured defaults."""
from framework.config import RuntimeConfig, get_hive_config
@@ -1,24 +0,0 @@
---
name: hive.batch-ledger
description: Track per-item status when processing collections to prevent skipped or duplicated items.
metadata:
author: hive
type: default-skill
---
## Operational Protocol: Batch Progress Ledger
When processing a collection of items, maintain a batch ledger in `_batch_ledger`.
Initialize when you identify the batch:
- `_batch_total`: total item count
- `_batch_ledger`: JSON with per-item status
Per-item statuses: pending → in_progress → completed|failed|skipped
- Set `in_progress` BEFORE processing
- Set final status AFTER processing with 1-line result_summary
- Include error reason for failed/skipped items
- Update aggregate counts after each item
- NEVER remove items from the ledger
- If resuming, skip items already marked completed
@@ -61,6 +61,7 @@ Whereas `wait_for_selector`, `browser_click(selector=...)`, `browser_type(select
### Empirically verified (2026-04-11)
Tested against `https://www.reddit.com/r/programming/` whose search input lives at:
```
document > reddit-search-large [shadow]
> faceplate-search-input#search-input [shadow]
@@ -95,13 +96,13 @@ All return real URLs and titles. On a fast page `navigate(wait_until="load")` re
### Timing expectations (measured against real sites)
| Site | Navigate load time |
|---|---|
| example.com | 100–400 ms |
| wikipedia.org | 200–500 ms |
| reddit.com | 1.5–2 s |
| x.com/twitter | 1.2–1.6 s |
| linkedin.com (logged in) | 4–5 s |
| Site | Navigate load time |
| ------------------------ | ------------------ |
| example.com              | 100–400 ms         |
| wikipedia.org            | 200–500 ms         |
| reddit.com               | 1.5–2 s            |
| x.com/twitter            | 1.2–1.6 s          |
| linkedin.com (logged in) | 4–5 s              |
For LinkedIn and other heavy SPAs, rely on `sleep()` after navigation to let the page hydrate.
@@ -124,7 +125,7 @@ Even after `wait_until="load"`, React/Vue SPAs often render their real chrome in
Why this is necessary:
- **React / Vue controlled components** don't trust JS-sourced `.focus()`. React uses event delegation and watches for *native* pointer/focus events — a `click` dispatched via CDP fires the real `pointerdown`/`pointerup`/`click`/`focus` sequence that React listens to, and updates its internal state. A JS-only `.focus()` sets `document.activeElement` but the framework's controlled state doesn't see it.
- **React / Vue controlled components** don't trust JS-sourced `.focus()`. React uses event delegation and watches for _native_ pointer/focus events — a `click` dispatched via CDP fires the real `pointerdown`/`pointerup`/`click`/`focus` sequence that React listens to, and updates its internal state. A JS-only `.focus()` sets `document.activeElement` but the framework's controlled state doesn't see it.
- **Draft.js** (X/Twitter compose) and **Lexical** (Gmail, LinkedIn DMs) use contenteditable divs with immutable editor state. They only enter "edit mode" after a real click on the editor surface. Typing at them without clicking routes keys to `document.body` or gets silently discarded.
- **Send/submit buttons are bound to framework state**, not DOM state. They're typically `disabled={!hasRealContent}` where `hasRealContent` is computed from React/Vue/Svelte state. The input field can have characters in the DOM but the button stays disabled because the framework never saw a real input event.
@@ -171,16 +172,16 @@ Always include an equivalent cleanup block in any script that types into a compo
### Verified site-specific quirks
| Site | Editor | Workaround |
| --- | --- | --- |
| **X / Twitter** compose | Draft.js | Click `[data-testid='tweetTextarea_0']` first, then type with `delay_ms=20`. First 1–2 chars may be eaten — accept truncation or prepend a throwaway char. Verify `[data-testid='tweetButton']` has `disabled: false` before clicking. |
| **LinkedIn** messaging | contenteditable (inside `#interop-outlet` shadow root) | Use `browser_shadow_query` to find the rect, click-coordinate to focus, then `browser_type_focused(text=...)` (selector-based `browser_type` can't reach shadow). Send button is `.msg-form__send-button`. |
| **LinkedIn** feed post composer | Quill/LinkedIn custom | Click the "Start a post" trigger first, wait 1s for modal, click the textarea, type. |
| **Reddit** comment/post box | ProseMirror | Click the textarea, wait 0.5s for the toolbar to mount, then type. Submit is `button[slot="submit-button"]` inside a shreddit-composer. |
| **Gmail** compose | Lexical | Click the body first. Gmail has a visible `div[contenteditable=true][aria-label*='Message Body']` after opening a compose window. |
| **Slack** message box | contenteditable | Click first, then type. Send is a paper-plane button with `data-qa='texty_send_button'`. |
| **Discord** | Slate | Click first. Discord's send is implicit on Enter (no button), so just press Enter after typing. |
| **Monaco** editors (GitHub code review, CodeSandbox) | Monaco | Click first, type with `delay_ms=10`. Monaco listens for `textarea` input events on a hidden textarea — requires focus to be on that textarea. |
### Plain text into a real input
@@ -247,6 +248,7 @@ The highlight overlay stays visible on the page for **10 seconds** after each in
- Popup appeared that you didn't need? Close it immediately
`browser_tabs` returns an `origin` field for each tab:
- `"agent"` — you opened it; you own it; close it when done
- `"popup"` — opened by a link or script; close after extracting what you need
- `"startup"` or `"user"` — leave these alone unless the task requires it
@@ -259,22 +261,22 @@ The bridge automatically evicts per-tab state (`_cdp_attached`, `_interaction_hi
### LinkedIn
| Target              | Selector                                              |
| ------------------- | ----------------------------------------------------- |
| Global search input | `input[data-testid='typeahead-input']`                |
| Own profile link    | `a[href*='linkedin.com/in/']`                         |
| Messaging overlay   | `#interop-outlet >>> [aria-label]` (use shadow_query) |
LinkedIn enforces **strict Trusted Types CSP**. Any script you inject via `browser_evaluate` that uses `innerHTML = "<...>"` will be **silently dropped** — the wrapper element gets added but its content is empty, no console error. Always use `createElement` + `appendChild` + `setAttribute` for DOM injection on LinkedIn. `style.cssText`, `textContent`, and `.value` assignments are fine (they don't go through the Trusted Types sink).
### Reddit (new reddit / shreddit)
| Target                | Selector                                                                     |
| --------------------- | ---------------------------------------------------------------------------- |
| Search input (shadow) | `reddit-search-large >>> #search-input` (rect only; type via click-to-focus) |
| Reddit logo (home)    | `#reddit-logo`                                                               |
| Subreddit posts       | `shreddit-post` custom elements                                              |
| Create post button    | `a[href*='/submit']`                                                         |
Reddit's search input lives **two shadow levels deep** inside `reddit-search-large > faceplate-search-input`. You cannot reach it with `browser_type(selector=)`. The working pattern:
@@ -285,15 +287,15 @@ Reddit's search input lives **two shadow levels deep** inside `reddit-search-lar
### X / Twitter
| Target                     | Selector                                      |
| -------------------------- | --------------------------------------------- |
| Main search input          | `input[data-testid='SearchBox_Search_Input']` |
| Home nav link              | `a[data-testid='AppTabBar_Home_Link']`        |
| Post text area (compose)   | `[data-testid='tweetTextarea_0']`             |
| Reply buttons on feed      | `[data-testid='reply']`                       |
| Post / Tweet submit button | `[data-testid='tweetButton']`                 |
| Caret (⋯) menu on a post   | `[data-testid='caret']`                       |
| Confirmation sheet button  | `[data-testid='confirmationSheetConfirm']`    |
**X uses Draft.js for the compose text editor**, which does NOT accept synthetic input reliably. Working workaround: `browser_type(selector='[data-testid="tweetTextarea_0"]', text="...", delay_ms=20)`. The delay gives Draft.js time to process each keystroke. The first 1–2 characters may still get eaten — accept minor truncation or prepend a throwaway character. After typing, check `[data-testid="tweetButton"]` has `disabled: false` before clicking submit.
@@ -366,17 +368,35 @@ If Chrome detaches the debugger for its own reasons (tab closed, user opened Dev
If reattach also fails, you'll get the underlying CDP error string — that's a real problem, usually the tab is gone.
## When to reach for `browser_evaluate`
## `browser_evaluate` is a last-resort escape hatch
Use it when:
- You need to read state from inside a shadow root that `browser_get_rect` doesn't handle
- You need a one-shot JS snippet to trigger a site-specific action (scroll a specific container, open a menu, set a form field value directly)
- You need to walk an AX tree or measure layout that the standard tools don't expose
**Before using `browser_evaluate`, try these first — in this order:**
Avoid it when:
- A standard tool (`browser_click_coordinate`, `browser_type`, `browser_press`) already does what you need. Those go through CDP's native event pipeline, which real sites trust more than synthetic JS dispatch.
- You're on a strict-CSP site and want to inject DOM — stick to `createElement` + `appendChild`, never `innerHTML`.
- You need to trigger React / Vue / framework state changes — those frameworks watch for real browser events (`input`, `change`, `click`), not scripted `dispatchEvent` calls. Native-event tools are more reliable.
1. **`browser_screenshot` + `browser_click_coordinate`** — works on every site regardless of shadow DOM, iframes, obfuscated classes. This is the default path for "click a thing you can see."
2. **`browser_type(use_insert_text=True, text=...)`** — for typing into ANY input/contenteditable, including Lexical and Draft.js. Handles click-focus-insert with built-in retries. Do **not** call `document.execCommand('insertText')` via evaluate; this tool already does it correctly.
3. **`browser_shadow_query`** or **`browser_get_rect(selector)`** with the `>>>` shadow-piercing syntax — for selector-based lookups across shadow roots.
4. **`browser_get_text` / `browser_get_attribute`** — for reading element state by selector.
5. **`browser_snapshot`** — for dumping the accessibility tree of the page.
If all five of those fit your goal, **do not use `browser_evaluate`.** Each evaluate call is a small LLM round-trip of ~30-100 tokens of JS plus a JSON response; five of them burn more context than a single screenshot-and-coordinate does, with less reliability.
### Anti-patterns — stop immediately if you catch yourself doing these
- **Trying multiple `querySelectorAll` variants when the first returned `[]`.** Different selectors on the same page rarely work if the first guess failed — modern SPAs obfuscate class names at build time. After one empty result, switch to `browser_screenshot` + `browser_click_coordinate`. Do not write `.artdeco-list__item`, then `[data-test-incoming-invitation-card]`, then `[class*="invitation"]` — you are already on the wrong path.
- **Writing `walk(root)` recursive shadow-DOM traversal functions.** Use `browser_shadow_query` — it traverses at the CDP level (native C++), not by re-running a recursive JS function every call.
- **Calling `document.execCommand('insertText', ...)` to type into a contenteditable.** Use `browser_type(use_insert_text=True, text='...')`. The high-level tool handles the exact same Lexical/Draft.js case but with click-focus-retry logic built in.
- **Accessing `iframe.contentDocument`.** Rarely works (cross-origin, late hydration) and when it does, the code is brittle. Use `browser_screenshot` to see the iframe, then `browser_click_coordinate` to interact.
- **Using `innerHTML = "<...>"` on a Trusted Types site (LinkedIn, GitHub).** The assignment is silently dropped. Use `createElement` + `appendChild` if you must inject DOM — but first, ask whether you really need to.
- **Triggering React/Vue state via synthetic `dispatchEvent`.** Frameworks watch for real browser events. Use `browser_click_coordinate`, `browser_press`, or `browser_type` — all go through CDP's native event pipeline.
### Legitimate uses (when nothing semantic fits)
- Reading a computed style, `window.innerWidth/Height`, `document.scrollingElement.scrollTop`, or other layout values the tools don't expose.
- Firing a one-shot site-specific API call (analytics beacon, feature-flag toggle).
- Stripping `onbeforeunload` before navigating away from a page with an unsent draft (LinkedIn, Gmail).
- Detecting whether a specific shadow-root host exists before a follow-up screenshot.
In all of these cases the script is SHORT (< 10 lines) and the result is CONSUMED (read, then acted on), not further probed.
## Login & auth walls
@@ -0,0 +1,111 @@
---
name: hive.colony-progress-tracker
description: Claim tasks, record step progress, and verify SOP gates in the colony SQLite queue. Applies when your spawn message includes a db_path field.
metadata:
author: hive
type: default-skill
---
## Operational Protocol: Colony Progress Tracker
**Applies when** your spawn message has `db_path:` and `colony_id:` fields. The DB is your durable working memory — tells you what's done, what to skip, which SOP gates you owe.
Access via `execute_command_tool` running `sqlite3 "<db_path>" "..."`. Tables: `tasks` (queue), `steps` (per-task decomposition), `sop_checklist` (hard gates).
### Claim: assigned task (check this FIRST)
If your spawn message includes a `task_id:` field, the queen pre-assigned a specific row to you. Claim that row by id — **do not** use the generic next-pending pattern below:
```bash
sqlite3 "<db_path>" <<'SQL'
UPDATE tasks SET status='claimed', worker_id='<worker-id>',
claim_token=lower(hex(randomblob(8))),
claimed_at=datetime('now'), updated_at=datetime('now')
WHERE id='<task_id>' AND status='pending'
RETURNING id, goal, payload;
SQL
```
Empty output → another worker raced you or the row is already done. Stop and report. Non-empty → that row is yours, proceed to "Load the plan".
### Claim: next pending (fallback when no task_id is assigned)
If your spawn message did NOT include `task_id:` — you are a generic fan-out worker racing on a shared queue. Use the generic next-pending claim:
```bash
sqlite3 "<db_path>" <<'SQL'
UPDATE tasks SET status='claimed', worker_id='<worker-id>',
claim_token=lower(hex(randomblob(8))),
claimed_at=datetime('now'), updated_at=datetime('now')
WHERE id=(SELECT id FROM tasks WHERE status='pending'
ORDER BY priority DESC, seq, created_at LIMIT 1)
RETURNING id, goal, payload;
SQL
```
Empty output → queue drained, exit. Otherwise the returned `id` is yours. **Never SELECT-then-UPDATE** — races.
### Load the plan
```bash
sqlite3 "<db_path>" "SELECT seq, id, title, status FROM steps WHERE task_id='<task-id>' ORDER BY seq;"
sqlite3 "<db_path>" "SELECT key, description, required, done_at FROM sop_checklist WHERE task_id='<task-id>';"
```
**Skip any step where status='done'.** That's the point — don't redo completed work.
### Execute a step
Before tool calls:
```bash
sqlite3 "<db_path>" "UPDATE steps SET status='in_progress', worker_id='<worker-id>', started_at=datetime('now') WHERE id='<step-id>';"
```
After success (one-line evidence: path, URL, key result):
```bash
sqlite3 "<db_path>" "UPDATE steps SET status='done', evidence='<what you did>', completed_at=datetime('now') WHERE id='<step-id>';"
```
### MANDATORY: SOP gate check before marking task done
```bash
sqlite3 "<db_path>" "SELECT key, description FROM sop_checklist WHERE task_id='<task-id>' AND required=1 AND done_at IS NULL;"
```
- Empty → proceed to "Mark task done".
- Non-empty → each row is work you still owe. Do it, then check it off:
```bash
sqlite3 "<db_path>" "UPDATE sop_checklist SET done_at=datetime('now'), done_by='<worker-id>', note='<why>' WHERE task_id='<task-id>' AND key='<key>';"
```
**Never mark a task done while this SELECT returns rows.** This gate exists specifically to stop you from declaring success while skipping required steps.
### Mark task done / failed
```bash
# Success:
sqlite3 "<db_path>" "UPDATE tasks SET status='done', completed_at=datetime('now'), updated_at=datetime('now') WHERE id='<task-id>' AND worker_id='<worker-id>';"
# Unrecoverable failure:
sqlite3 "<db_path>" "UPDATE tasks SET status='failed', last_error='<one sentence>', completed_at=datetime('now'), updated_at=datetime('now') WHERE id='<task-id>' AND worker_id='<worker-id>';"
```
The `AND worker_id=?` guard means a reclaimed row won't accept your write — treat zero rows affected as "your claim was revoked, stop."
### Loop
After done/failed → claim the next task. Exit only when claim returns empty.
### Errors + debug
- **"database is locked"**: retry with 100ms → 1s backoff, max 5 attempts. `busy_timeout=5000` handles most contention silently.
- **Queue health**: `SELECT status, count(*) FROM tasks GROUP BY status;`
- **Your in-flight work**: `SELECT id, goal, status FROM tasks WHERE worker_id='<worker-id>';`
### Anti-patterns (will break the queue)
- Don't DDL (CREATE/ALTER/DROP).
- Don't DELETE — failed tasks stay as `failed` for audit.
- Don't skip the MANDATORY SOP gate check before marking a task done.
- Don't hold a task >15min without updates — the stale-claim reclaimer revokes your claim.
- Don't invent task IDs. Workers update existing rows; only the queen enqueues new ones.
@@ -1,6 +1,6 @@
---
name: hive.context-preservation
description: Proactively preserve critical information before automatic context pruning destroys it.
description: Proactively extract critical values from tool results into working notes before automatic context pruning destroys them.
metadata:
author: hive
type: default-skill
@@ -8,17 +8,16 @@ metadata:
## Operational Protocol: Context Preservation
You operate under a finite context window. Important information WILL be pruned.
You operate under a finite context window. Older tool results WILL be pruned. Extract what you need while it's still in context.
Save-As-You-Go: After any tool call producing information you'll need later,
immediately extract key data into `_working_notes` or `_preserved_data`.
Do NOT rely on referring back to old tool results.
**Save-as-you-go.** After any tool call producing information you'll need later, immediately extract the key data into `_working_notes` or `_preserved_data`. Do not rely on referring back to old tool results — once they're pruned they're gone.
What to extract: URLs and key snippets (not full pages), relevant API fields
(not raw JSON), specific lines/values (not entire files), analysis results
(not raw data).
**What to extract:**
- URLs and key snippets (not full pages)
- Relevant API fields (not raw JSON blobs)
- Specific lines, values, or IDs (not entire files)
- Analysis conclusions (not raw data)
Before transitioning to the next phase/node, write a handoff summary to
`_handoff_context` with everything the next phase needs to know.
**Handoffs between tasks** happen through `progress.db`, not through shared-buffer handoff blobs. When you finish a task, any state the next worker needs goes into the task row itself (`steps.evidence`, `tasks.last_error`, `sop_checklist.note`) — see `hive.colony-progress-tracker`. Use `_working_notes` for things the DB schema doesn't cover.
You will receive an alert when context reaches {{warn_at_usage_ratio_pct}}% — preserve immediately.
@@ -1,6 +1,6 @@
---
name: hive.error-recovery
description: Follow a structured recovery protocol when tool calls fail instead of blindly retrying or giving up.
description: Follow a structured recovery decision tree when tool calls fail instead of blindly retrying or giving up.
metadata:
author: hive
type: default-skill
@@ -10,9 +10,20 @@ metadata:
When a tool call fails:
1. Diagnose — record error in notes, classify as transient or structural
2. Decide — transient: retry once. Structural fixable: fix and retry.
Structural unfixable: record as failed, move to next item.
Blocking all progress: record escalation note.
3. Adapt — if same tool failed {{max_retries_per_tool}}+ times, stop using it and find alternative.
Update plan in notes. Never silently drop the failed item.
1. **Diagnose** — classify the failure as *transient* (network blip, rate limit, timeout) or *structural* (wrong selector, missing auth, invalid schema, permission denied).
2. **Decide:**
- Transient → retry once.
- Structural + fixable → fix the input and retry.
- Structural + unfixable → record the failure and move to the next item.
- Blocking all progress → escalate.
3. **Adapt** — if the same tool has failed {{max_retries_per_tool}}+ times in a row, stop using it and find an alternative approach.
**Never silently drop a failed item.** If the item is a task in the colony queue, write the failure to the DB instead of an in-memory buffer:
```bash
sqlite3 "$DB_PATH" "UPDATE tasks SET status='failed', last_error='<one-sentence reason>', completed_at=datetime('now'), updated_at=datetime('now') WHERE id='<task-id>' AND worker_id='<your-worker-id>';"
```
The `tasks.retry_count` column and the stale-claim reclaimer handle auto-retry for crashes; your job is the within-run decision tree above. See `hive.colony-progress-tracker` for the full queue protocol.
@@ -15,6 +15,28 @@ LinkedIn is the hardest mainstream site to automate because it combines **shadow
**Always activate `browser-automation` first.** This skill assumes you already know about CSS-px coordinates, `browser_type`/`browser_type_focused`, and `browser_shadow_query`. The guidance below is LinkedIn-specific; general browser rules are there.
## Rule #0: screenshot + coordinates, not selectors
LinkedIn changes class names aggressively and hides composers inside shadow roots AND iframes. **Selectors break constantly.** Your default strategy on every LinkedIn page should be:
1. `browser_screenshot()` — see the page visually
2. Pick the target's position from the image
3. `browser_coords(image_x, image_y)` → get CSS pixels
4. `browser_click_coordinate(css_x, css_y)` — reaches shadow DOM, iframes, and React elements indifferently
5. `browser_type(use_insert_text=True, text=...)` — types into whatever is focused, including Lexical composers
**If `browser_evaluate(...querySelectorAll...)` returns `[]` even once, do not try a different selector.** Stop, screenshot, and click. The "what if I try `.artdeco-list__item` next" instinct has burned ~50 tool calls in real sessions before the agent pivoted. Don't fall into that loop.
The selectors in the table below are **only** for when you already know the target is in the light DOM and you want a faster path than screenshot+coord. **When in doubt, default to coordinates.**
## Invitation manager — inline message button path is BROKEN
If the user asks to message a connection request **from the invitation manager page without accepting first**, the inline "Message" button opens a composer inside a nested **iframe overlay** (not a shadow root). The iframe's `contentDocument` is either cross-origin-blocked or not hydrated at access time. This path is **not reliably automatable today.**
**Redirect:** click the person's name/profile link on the card, go to the profile page, and use the standard Profile Message flow below. The profile flow is battle-tested; the inline-iframe flow isn't.
If you end up writing `document.activeElement.tagName === 'IFRAME'` inside a `browser_evaluate`, you've hit this trap. Stop and go to the profile page.
## Timing expectations
- `browser_navigate(wait_until="load")` — LinkedIn takes **4–5 seconds** to load the feed cold.
@@ -1,6 +1,6 @@
---
name: hive.note-taking
description: Maintain structured working notes throughout execution to prevent information loss during context pruning.
description: Maintain a free-form scratchpad of decisions, extracted values, and open questions so context pruning doesn't lose anything you still need.
metadata:
author: hive
type: default-skill
@@ -8,20 +8,21 @@ metadata:
## Operational Protocol: Structured Note-Taking
Maintain structured working notes in shared buffer key `_working_notes`.
Maintain free-form working notes in shared buffer key `_working_notes` for data that *you* need to remember but that isn't captured by the colony task queue.
**Do not duplicate the queue in here.** Per-task goal, ordered steps, and SOP gates live in `progress.db` — use `hive.colony-progress-tracker` for those. These notes are for things the DB schema doesn't cover.
Update at these checkpoints:
- After completing each discrete subtask or batch item
- After receiving new information that changes your plan
- Before any tool call that will produce substantial output
- After receiving new information that changes how you plan to approach the current step
- Before any tool call that will produce substantial output you'll need to reference later
- When you make a non-obvious decision whose *why* would be lost if the tool call history gets pruned
Structure:
### Objective — restate the goal
### Current Plan — numbered steps, mark completed with ✓
### Key Decisions — decisions made and WHY
### Working Data — intermediate results, extracted values
### Open Questions — uncertainties to verify
### Blockers — anything preventing progress
### Working Data — intermediate results, extracted values (URLs, IDs, key snippets — not full pages)
### Open Questions — uncertainties you plan to verify
### Blockers — anything preventing progress that isn't already captured in `tasks.last_error`
Update incrementally — do not rewrite from scratch each time.
@@ -1,17 +0,0 @@
---
name: hive.task-decomposition
description: Decompose complex tasks into explicit subtasks before diving in.
metadata:
author: hive
type: default-skill
---
## Operational Protocol: Task Decomposition
Before starting a complex task:
1. Decompose — break into numbered subtasks in `_working_notes` Current Plan
2. Estimate — relative effort per subtask (small/medium/large)
3. Execute — work through in order, mark ✓ when complete
4. Budget — if running low on iterations, prioritize by impact
5. Verify — before declaring done, every subtask must be ✓, skipped (with reason), or blocked
+2 -2
View File
@@ -36,8 +36,8 @@ class SkillsConfig:
# Default skill configuration
default_skills = {
"hive.note-taking": {"enabled": True},
"hive.batch-ledger": {"enabled": True, "checkpoint_every_n": 10},
"hive.quality-monitor": {"enabled": False},
"hive.quality-monitor": {"enabled": False, "assessment_interval": 10},
"hive.error-recovery": {"max_retries_per_tool": 5},
}
"""
+32 -49
View File
@@ -24,34 +24,21 @@ _SKILL_DEFAULTS: dict[str, dict[str, Any]] = {
"hive.quality-monitor": {"assessment_interval": 5},
"hive.error-recovery": {"max_retries_per_tool": 3},
"hive.context-preservation": {"warn_at_usage_ratio_pct": 45},
"hive.batch-ledger": {"checkpoint_every_n": 5},
}
# Keywords that indicate a batch processing scenario (DS-12)
_BATCH_KEYWORDS: tuple[str, ...] = (
"list of",
"collection of",
"set of",
"batch of",
"each item",
"for each",
"process all",
"records",
"entries",
"rows",
"items",
)
_BATCH_INIT_NUDGE = (
"Note: your input appears to describe a batch operation. "
"Initialize `_batch_ledger` with the total item count before processing."
)
def is_batch_scenario(text: str) -> bool:
    """Deprecated: batch auto-detection is no longer used.

    Kept as a no-op so the agent_loop call site (which wraps it in an
    ``if ctx.default_skill_batch_nudge:`` guard that's also now always
    empty) can stay unchanged until a broader cleanup. The old
    ``_batch_ledger`` shared-buffer feature was replaced by the
    per-colony SQLite task queue (``hive.colony-progress-tracker``),
    which lives in ``progress.db`` and is authoritative for batch
    state across workers and runs.

    Args:
        text: Ignored. Previously scanned (lower-cased) for batch
            keywords such as "list of" / "for each".

    Returns:
        Always ``False``.
    """
    # The superseded keyword-scanning body was left in front of this
    # docstring by a bad merge, which made the deprecation docstring a
    # dead expression statement and this return unreachable. Only the
    # no-op remains.
    return False
def _apply_overrides(skill_name: str, body: str, overrides: dict[str, Any]) -> str:
@@ -67,40 +54,37 @@ def _apply_overrides(skill_name: str, body: str, overrides: dict[str, Any]) -> s
return body
# Ordered list of default skills (name → directory).
#
# Removed on 2026-04-15 as part of the colony-progress-tracker rollout:
#   - hive.task-decomposition — steps table in progress.db supersedes
#     in-memory ``_working_notes → Current Plan`` decomposition.
#   - hive.batch-ledger — tasks table in progress.db supersedes
#     the ``_batch_ledger`` dict-shaped queue with its pending →
#     in_progress → completed/failed/skipped state machine.
# Both were duplicating state that belongs in SQLite.
SKILL_REGISTRY: dict[str, str] = {
    "hive.note-taking": "note-taking",
    "hive.context-preservation": "context-preservation",
    "hive.quality-monitor": "quality-monitor",
    "hive.error-recovery": "error-recovery",
    "hive.colony-progress-tracker": "colony-progress-tracker",
    "hive.writing-hive-skills": "writing-hive-skills",
}
# Shared buffer keys referenced by the remaining default skills (used
# for permission auto-inclusion). The dead keys for batch-ledger
# (``_batch_*``), task-decomposition (``_subtasks``,
# ``_iteration_budget_remaining``), the handoff buffer
# (``_handoff_context``), and the error-log buffers (``_error_log``,
# ``_failed_tools``, ``_escalation_needed``) were removed when those
# features migrated to progress.db.
DATA_BUFFER_KEYS: list[str] = [
    # note-taking
    "_working_notes",
    "_notes_updated_at",
    # context-preservation
    "_preserved_data",
    # quality-monitor
    "_quality_log",
    "_quality_degradation_count",
]
@@ -252,16 +236,15 @@ class DefaultSkillManager:
@property
def batch_init_nudge(self) -> str | None:
"""Nudge text to prepend to system prompt when batch input detected (DS-12).
"""Deprecated: always returns None.
Returns None if ``hive.batch-ledger`` is disabled or auto_detect_batch is False.
The ``hive.batch-ledger`` default skill was removed when batch
tracking moved into ``progress.db`` (``hive.colony-progress-
tracker``). Callers in agent_host, colony_runtime, and
orchestrator still read this property; returning None keeps
them functional with no system-prompt nudge.
"""
if "hive.batch-ledger" not in self._skills:
return None
overrides = self._config.get_default_overrides("hive.batch-ledger")
if overrides.get("auto_detect_batch") is False:
return None
return _BATCH_INIT_NUDGE
return None
@property
def context_warn_ratio(self) -> float | None:
+541 -58
View File
@@ -903,10 +903,76 @@ def register_queen_lifecycle_tools(
# ``start_worker`` was removed in the Phase 4 unification — its
# bare-bones spawn duplicated ``run_agent_with_input`` (which has
# credential preflight, concurrency guard, and phase tracking on
# top). The shared preflight timeout below is still used by
# ``run_agent_with_input``.
# top). The shared preflight timeout below is used by both
# ``run_agent_with_input`` and ``run_parallel_workers``.
_START_PREFLIGHT_TIMEOUT = 15 # seconds
async def _preflight_credentials(
    legacy: Any,
    *,
    tool_label: str,
) -> set[str]:
    """Compute tools whose credentials are missing and resync MCP servers.

    Shared between ``run_agent_with_input`` (single spawn) and
    ``run_parallel_workers`` (batch spawn). Returns the set of
    tool names whose credentials failed validation; the caller
    filters these out of the spawn's tool lists.

    Exceptions (including validator bugs) are logged and treated
    as "no tools dropped" so a broken validator can't block a
    spawn. Wall-clock bound at ``_START_PREFLIGHT_TIMEOUT`` so
    slow credential HTTP health checks can't stall the LLM turn.

    Args:
        legacy: Runtime object whose ``graph.nodes`` are scanned by
            ``compute_unavailable_tools``.
        tool_label: Caller name used to prefix log messages
            (e.g. ``"run_parallel_workers"``).
    """
    unavailable: set[str] = set()

    async def _run() -> None:
        nonlocal unavailable
        try:
            from framework.credentials.validation import compute_unavailable_tools

            loop = asyncio.get_running_loop()
            # Credential validation does blocking work (lookups /
            # health checks) — keep it off the event loop.
            drop, messages = await loop.run_in_executor(
                None,
                lambda: compute_unavailable_tools(legacy.graph.nodes),
            )
            unavailable = drop
            if drop:
                logger.warning(
                    "%s: dropping %d tool(s) with unavailable credentials: %s",
                    tool_label,
                    len(drop),
                    "; ".join(messages),
                )
        except Exception as exc:
            # A bug in the validator itself (not a credential failure)
            # must not block the spawn — proceed as if nothing dropped.
            logger.warning(
                "%s: compute_unavailable_tools raised, proceeding without "
                "credential-based tool filtering: %s",
                tool_label,
                exc,
            )

        # Resync MCP servers inside _run so the resync is also covered
        # by the wall-clock timeout below. Best-effort: failure is
        # logged, never raised.
        runner = getattr(session, "runner", None)
        if runner is not None:
            try:
                loop = asyncio.get_running_loop()
                await loop.run_in_executor(
                    None,
                    lambda: runner._tool_registry.resync_mcp_servers_if_needed(),
                )
            except Exception as exc:
                logger.warning("%s: MCP resync failed: %s", tool_label, exc)

    try:
        await asyncio.wait_for(_run(), timeout=_START_PREFLIGHT_TIMEOUT)
    except TimeoutError:
        # NOTE(review): asyncio.wait_for raises asyncio.TimeoutError,
        # which aliases builtin TimeoutError only on Python 3.11+ —
        # confirm the project's minimum Python version.
        logger.warning(
            "%s: credential preflight timed out after %ds — proceeding",
            tool_label,
            _START_PREFLIGHT_TIMEOUT,
        )
    return unavailable
# --- stop_worker -----------------------------------------------------------
async def stop_worker(*, reason: str = "Stopped by queen") -> str:
@@ -1078,6 +1144,105 @@ def register_queen_lifecycle_tools(
}
)
# Credential preflight — mirrors the one run_agent_with_input
# performs. Without this, missing credentials (e.g. stale
# GITHUB_TOKEN) fail once PER spawned worker, yielding N
# duplicate error reports for a single fixable issue. Catch
# once upfront, build a filtered tool list, and pass it to
# every spawn via tools_override.
legacy_for_preflight = _get_runtime()
unavailable_tools_parallel: set[str] = set()
tools_override_parallel: list[Any] | None = None
if legacy_for_preflight is not None:
try:
unavailable_tools_parallel = await _preflight_credentials(
legacy_for_preflight, tool_label="run_parallel_workers"
)
except CredentialError as e:
# Structured credential failure: publish the
# CREDENTIALS_REQUIRED event so the frontend's modal
# can fire, and return the same shape the single-path
# tool returns on the same failure.
error_payload = credential_errors_to_json(e)
error_payload["agent_path"] = str(getattr(session, "worker_path", "") or "")
bus = getattr(session, "event_bus", None)
if bus is not None:
await bus.publish(
AgentEvent(
type=EventType.CREDENTIALS_REQUIRED,
stream_id="queen",
data=error_payload,
)
)
return json.dumps(error_payload)
if unavailable_tools_parallel:
colony_tools = list(getattr(colony, "_tools", []) or [])
before = len(colony_tools)
tools_override_parallel = [
t
for t in colony_tools
if getattr(t, "name", None) not in unavailable_tools_parallel
]
logger.info(
"run_parallel_workers: dropped %d tool object(s) from spawn_tools (unavailable credentials)",
before - len(tools_override_parallel),
)
# Colony progress tracker wiring: if the session's loaded
# worker points at a colony directory that has a progress.db,
# inject db_path + colony_id into every per-task ``data``
# dict so each spawned worker sees them in its first user
# message and can claim rows from the queue. ColonyRuntime.
# spawn() detects db_path in input_data and pre-activates
# hive.colony-progress-tracker into the catalog prompt.
_colony_db_path: str | None = None
_colony_id: str | None = None
_worker_path = getattr(session, "worker_path", None)
if _worker_path:
from pathlib import Path as _Path
_wp = _Path(_worker_path)
_pdb = _wp / "data" / "progress.db"
if _pdb.exists():
_colony_db_path = str(_pdb.resolve())
_colony_id = _wp.name
# Phase 2: enqueue each task into progress.db BEFORE building
# spawn specs so every parallel worker has a pre-assigned row
# to claim. Without this the queue stays empty and each
# worker's claim UPDATE affects zero rows, silently falling
# back to executing from its spawn message.
_enqueued_task_ids: list[str | None] = [None] * len(tasks)
if _colony_db_path:
from pathlib import Path as _PathP
from framework.host.progress_db import (
enqueue_task as _enqueue_task_fn,
)
_pdb_path_obj = _PathP(_colony_db_path)
for _i, _spec in enumerate(tasks):
if not isinstance(_spec, dict):
continue
_task_text_pre = str(_spec.get("task", "")).strip()
if not _task_text_pre:
continue
try:
_enqueued_task_ids[_i] = await asyncio.to_thread(
_enqueue_task_fn,
_pdb_path_obj,
_task_text_pre,
source="run_parallel_workers",
)
except Exception as _enqueue_exc:
logger.warning(
"run_parallel_workers: failed to enqueue tasks[%d] "
"(spawn proceeding without pinned task_id): %s",
_i,
_enqueue_exc,
)
# Normalise: each entry must have a non-empty "task" string.
normalised: list[dict] = []
for i, spec in enumerate(tasks):
@@ -1086,18 +1251,58 @@ def register_queen_lifecycle_tools(
task_text = str(spec.get("task", "")).strip()
if not task_text:
return json.dumps({"error": f"tasks[{i}].task is empty"})
spec_data = spec.get("data") if isinstance(spec.get("data"), dict) else {}
if _colony_db_path:
spec_data = {
**spec_data,
"db_path": _colony_db_path,
"colony_id": _colony_id,
}
if _enqueued_task_ids[i]:
spec_data["task_id"] = _enqueued_task_ids[i]
normalised.append(
{
"task": task_text,
"data": spec.get("data") if isinstance(spec.get("data"), dict) else None,
"data": spec_data or None,
}
)
if _colony_db_path:
_pinned = sum(1 for tid in _enqueued_task_ids if tid)
logger.info(
"run_parallel_workers: attached progress_db context to "
"%d spawn(s) (colony_id=%s, %d pinned task_ids)",
len(normalised),
_colony_id,
_pinned,
)
try:
worker_ids = await colony.spawn_batch(normalised)
worker_ids = await colony.spawn_batch(
normalised,
tools_override=tools_override_parallel,
)
except Exception as e:
return json.dumps({"error": f"spawn_batch failed: {e}"})
# Phase transition — mirrors run_agent_with_input. With the
# batch now spawned, the queen is semantically "running" until
# wait_for_worker_reports returns, so phase-gated running
# tools (inject_message, reply_to_worker, ...) should be
# available. Without this change run_parallel_workers left
# the queen in whatever phase she was in (typically staging).
if phase_state is not None:
try:
await phase_state.switch_to_running()
_update_meta_json(
session_manager, manager_session_id, {"phase": "running"}
)
except Exception as exc:
logger.warning(
"run_parallel_workers: phase transition to 'running' failed (non-fatal): %s",
exc,
)
try:
reports = await colony.wait_for_worker_reports(
worker_ids,
@@ -1322,6 +1527,35 @@ def register_queen_lifecycle_tools(
except OSError as e:
return None, f"failed to install skill into {target}: {e}"
# Cleanup the source directory after a successful install so
# the authored skill doesn't linger as debris in the agent
# workspace (or — pre-sandbox-split — in the hive git
# checkout). Only removes paths that are OUTSIDE
# ``~/.hive/skills/`` so we never nuke the canonical install
# target or user-owned skill dirs.
try:
src_resolved = src.resolve()
skills_root_resolved = target_root.resolve()
try:
src_resolved.relative_to(skills_root_resolved)
_under_skills_root = True
except ValueError:
_under_skills_root = False
if not _under_skills_root:
_shutil.rmtree(src_resolved)
logger.info(
"create_colony: cleaned up authored skill source at %s "
"(installed to %s)",
src_resolved,
target,
)
except OSError as e:
logger.warning(
"create_colony: failed to clean up skill source at %s (non-fatal): %s",
src,
e,
)
return target, None
async def create_colony(
@@ -1329,6 +1563,7 @@ def register_queen_lifecycle_tools(
colony_name: str,
task: str,
skill_path: str,
tasks: list[dict] | None = None,
) -> str:
"""Create a colony after installing a pre-authored skill folder.
@@ -1338,6 +1573,13 @@ def register_queen_lifecycle_tools(
they're ready to start the worker — at that point the worker
reads the task from ``worker.json`` and the skill from
``~/.hive/skills/`` and starts informed.
When *tasks* is provided, each entry is seeded into the
colony's ``progress.db`` task queue in a single transaction.
Workers then claim rows from the queue using the
``hive.colony-progress-tracker`` default skill. Each task dict
accepts: ``goal`` (required), optional ``steps``,
``sop_items``, ``priority``, ``payload``, ``parent_task_id``.
"""
if session is None:
return json.dumps({"error": "No session bound to this tool registry."})
@@ -1392,6 +1634,7 @@ def register_queen_lifecycle_tools(
session=session,
colony_name=cn,
task=(task or "").strip(),
tasks=tasks if isinstance(tasks, list) else None,
)
except Exception as e:
logger.exception("create_colony: fork failed after installing skill")
@@ -1444,6 +1687,8 @@ def register_queen_lifecycle_tools(
"is_new": fork_result.get("is_new", True),
"skill_installed": str(installed_skill),
"skill_name": installed_skill.name if installed_skill else None,
"db_path": fork_result.get("db_path"),
"tasks_seeded": len(fork_result.get("task_ids") or []),
}
)
@@ -1541,6 +1786,57 @@ def register_queen_lifecycle_tools(
"protocol'."
),
},
"tasks": {
"type": "array",
"description": (
"Optional pre-seeded task queue for the colony. "
"When the colony is a fan-out of many similar "
"units of work (e.g. 'process record #1234', "
"'scrape profile X'), pass them here as an "
"array and workers will claim rows atomically "
"from the SQLite queue using the "
"hive.colony-progress-tracker skill. Each task "
"needs a 'goal' string; optionally include "
"'steps' (ordered subtasks), 'sop_items' "
"(required checklist gates), 'priority' "
"(higher runs first), and 'payload' "
"(task-specific parameters). Can be hundreds "
"or thousands of entries — the bulk insert "
"runs in a single transaction."
),
"items": {
"type": "object",
"properties": {
"goal": {"type": "string"},
"priority": {"type": "integer"},
"payload": {},
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"detail": {"type": "string"},
},
"required": ["title"],
},
},
"sop_items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"key": {"type": "string"},
"description": {"type": "string"},
"required": {"type": "boolean"},
},
"required": ["key", "description"],
},
},
},
"required": ["goal"],
},
},
},
"required": ["colony_name", "task", "skill_path"],
},
@@ -1552,6 +1848,158 @@ def register_queen_lifecycle_tools(
)
tools_registered += 1
# --- enqueue_task ------------------------------------------------------------
async def enqueue_task_tool(
    *,
    colony_name: str,
    goal: str,
    steps: list[dict] | None = None,
    sop_items: list[dict] | None = None,
    payload: Any = None,
    priority: int = 0,
    parent_task_id: str | None = None,
) -> str:
    """Append a single task to an existing colony's progress.db queue.

    Use this when the colony is already created and more work
    needs to be fanned out (webhook-driven, follow-up requests,
    worker-generated subtasks). The colony's workers pick it up
    on their next claim cycle.

    Args:
        colony_name: Target colony (validated against ``_COLONY_NAME_RE``).
        goal: Human-readable task description; must be non-empty.
        steps: Optional ordered subtask dicts.
        sop_items: Optional hard-gate checklist item dicts.
        payload: Optional task-specific parameters (stored as JSON).
        priority: Higher values run first.
        parent_task_id: Optional audit-only parent task reference.

    Returns:
        JSON string — ``{"status": "enqueued", ...}`` on success,
        ``{"error": ...}`` on validation or insert failure.
    """
    cn = (colony_name or "").strip()
    if not _COLONY_NAME_RE.match(cn):
        return json.dumps(
            {"error": "colony_name must be lowercase alphanumeric with underscores"}
        )
    # Reject empty goals up front — mirrors run_parallel_workers'
    # "tasks[i].task is empty" check. Without this, an empty row lands
    # in the queue and a worker claims it with nothing to act on.
    goal_text = (goal or "").strip()
    if not goal_text:
        return json.dumps({"error": "goal must be a non-empty string"})

    from pathlib import Path as _Path

    from framework.host.progress_db import (
        enqueue_task as _enqueue_task,
        ensure_progress_db as _ensure_db,
    )

    colony_dir = _Path.home() / ".hive" / "colonies" / cn
    if not colony_dir.is_dir():
        return json.dumps({"error": f"colony '{cn}' not found"})
    try:
        # SQLite access is blocking — run both calls off the event loop.
        db_path = await asyncio.to_thread(_ensure_db, colony_dir)
        task_id = await asyncio.to_thread(
            _enqueue_task,
            db_path,
            goal_text,
            steps=steps,
            sop_items=sop_items,
            payload=payload,
            priority=priority,
            parent_task_id=parent_task_id,
        )
    except Exception as e:
        logger.exception("enqueue_task: failed to insert row")
        return json.dumps({"error": f"enqueue_task failed: {e}"})
    return json.dumps(
        {
            "status": "enqueued",
            "colony_name": cn,
            "task_id": task_id,
            "db_path": str(db_path),
        }
    )
_enqueue_task_tool = Tool(
name="enqueue_task",
description=(
"Append a single task to an existing colony's progress.db "
"queue. Use this after create_colony when more work needs "
"to be fanned out — e.g. a webhook fired, the user asked "
"for a follow-up run, or a worker spawned a subtask. The "
"colony's workers pick it up on their next claim cycle "
"(atomic UPDATE … WHERE status='pending'). For bulk "
"authoring at colony creation time, pass the 'tasks' "
"array to create_colony instead."
),
parameters={
"type": "object",
"properties": {
"colony_name": {
"type": "string",
"description": "Target colony name (lowercase + underscores).",
},
"goal": {
"type": "string",
"description": (
"Human-readable task description. Self-contained — "
"the worker has no context beyond this string plus "
"any steps/sop_items/payload you attach."
),
},
"steps": {
"type": "array",
"description": (
"Optional ordered subtasks the worker should "
"check off as it executes. Each step needs a "
"'title'; optional 'detail' for longer "
"instructions."
),
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"detail": {"type": "string"},
},
"required": ["title"],
},
},
"sop_items": {
"type": "array",
"description": (
"Optional hard-gate checklist items the worker "
"MUST address before marking the task done. "
"Each item needs a 'key' (slug) and "
"'description'; 'required' defaults to true."
),
"items": {
"type": "object",
"properties": {
"key": {"type": "string"},
"description": {"type": "string"},
"required": {"type": "boolean"},
},
"required": ["key", "description"],
},
},
"payload": {
"description": (
"Optional task-specific parameters. Stored as "
"JSON in the 'payload' column."
),
},
"priority": {
"type": "integer",
"description": "Higher values run first. Default 0.",
},
"parent_task_id": {
"type": "string",
"description": (
"Optional reference to an existing task this "
"one was spawned from (audit only; no blocking "
"dependency resolver today)."
),
},
},
"required": ["colony_name", "goal"],
},
)
registry.register(
"enqueue_task",
_enqueue_task_tool,
lambda inputs: enqueue_task_tool(**inputs),
)
tools_registered += 1
# --- switch_to_reviewing ----------------------------------------------------
async def switch_to_reviewing_tool() -> str:
@@ -2969,7 +3417,8 @@ def register_queen_lifecycle_tools(
if preamble.get("pending_question"):
result["pending_question"] = preamble["pending_question"]
result["agent_idle_seconds"] = round(runtime.agent_idle_seconds, 1)
_idle = runtime.agent_idle_seconds
result["agent_idle_seconds"] = round(_idle, 1) if _idle != float("inf") else -1
for key in ("current_node", "current_iteration"):
if key in preamble:
@@ -3713,6 +4162,33 @@ def register_queen_lifecycle_tools(
task,
)
# Concurrency budget check — mirrors run_parallel_workers so a
# queen in a loop can't silently exceed max_concurrent_workers
# by hammering run_agent_with_input. Per-call count is 1, so
# the check is ``active + 1 > max_concurrent``.
colony_cfg = getattr(colony, "_config", None) or getattr(colony, "config", None)
max_concurrent = getattr(colony_cfg, "max_concurrent_workers", None)
if max_concurrent and max_concurrent > 0:
active = 0
try:
workers = getattr(colony, "_workers", {}) or {}
for w in workers.values():
handle = getattr(w, "_task_handle", None)
if handle is not None and not handle.done():
active += 1
except Exception:
active = 0
if active + 1 > max_concurrent:
return json.dumps(
{
"error": (
f"run_agent_with_input would exceed max_concurrent_workers "
f"({active} active + 1 new > {max_concurrent}). "
"Wait for an existing worker to finish or stop one."
)
}
)
try:
# Pre-flight: compute the set of tools whose credentials are
# NOT currently available, and resync MCP servers. We do NOT
@@ -3723,58 +4199,9 @@ def register_queen_lifecycle_tools(
# to block the whole spawn with a CredentialError; the fix
# is to treat unset credentials as "drop these tools" rather
# than "abort the worker".
#
# Note: the MCP admission gate (_build_mcp_admission_gate in
# tool_registry.py) already filters MCP tools at registration
# time. This preflight covers the non-MCP path — tools.py
# discoveries via discover_from_module — which has no
# credential gate of its own.
loop = asyncio.get_running_loop()
unavailable_tools: set[str] = set()
async def _preflight():
nonlocal unavailable_tools
try:
from framework.credentials.validation import compute_unavailable_tools
drop, messages = await loop.run_in_executor(
None,
lambda: compute_unavailable_tools(legacy.graph.nodes),
)
unavailable_tools = drop
if drop:
logger.warning(
"run_agent_with_input: dropping %d tool(s) with "
"unavailable credentials from worker spawn: %s",
len(drop),
"; ".join(messages),
)
except Exception as exc:
# Validation itself failing (not a credential failure —
# a code error in the validator) should not block the
# spawn. Log and proceed as if nothing was dropped.
logger.warning(
"compute_unavailable_tools raised, proceeding without credential-based tool filtering: %s",
exc,
)
runner = getattr(session, "runner", None)
if runner:
try:
await loop.run_in_executor(
None,
lambda: runner._tool_registry.resync_mcp_servers_if_needed(),
)
except Exception as e:
logger.warning("MCP resync failed: %s", e)
try:
await asyncio.wait_for(_preflight(), timeout=_START_PREFLIGHT_TIMEOUT)
except TimeoutError:
logger.warning(
"run_agent_with_input preflight timed out after %ds — proceeding",
_START_PREFLIGHT_TIMEOUT,
)
unavailable_tools = await _preflight_credentials(
legacy, tool_label="run_agent_with_input"
)
# Build a per-spawn AgentSpec that mirrors the loaded
# worker's entry-node identity. This is what makes the
@@ -3848,10 +4275,66 @@ def register_queen_lifecycle_tools(
dropped_count,
)
# Colony progress tracker wiring: if the loaded worker
# lives under ~/.hive/colonies/{name}/ and has a
# progress.db, inject db_path + colony_id into input_data
# so the spawned worker sees them in its first user
# message and can use the hive.colony-progress-tracker
# skill to claim tasks from the queue.
_spawn_input_data: dict[str, Any] = {"user_request": task}
_worker_path = getattr(session, "worker_path", None)
if _worker_path:
from pathlib import Path as _Path
_worker_path_p = _Path(_worker_path)
_progress_db = _worker_path_p / "data" / "progress.db"
if _progress_db.exists():
_spawn_input_data["db_path"] = str(_progress_db.resolve())
_spawn_input_data["colony_id"] = _worker_path_p.name
logger.info(
"run_agent_with_input: attached progress_db context "
"(colony_id=%s, db_path=%s)",
_worker_path_p.name,
_progress_db,
)
# Phase 2: enqueue the task into progress.db BEFORE
# spawning so the worker has a concrete row to
# claim. Without this the queue is empty and the
# worker's claim UPDATE affects zero rows, so it
# silently falls back to executing from the chat
# spawn message. Any enqueue failure is logged and
# the spawn proceeds without a pinned task_id
# (degrades to the pre-Phase-2 behavior).
try:
from framework.host.progress_db import (
enqueue_task as _enqueue_task_fn,
)
_task_id = await asyncio.to_thread(
_enqueue_task_fn,
_progress_db,
task,
source="run_agent_with_input",
)
_spawn_input_data["task_id"] = _task_id
logger.info(
"run_agent_with_input: enqueued task %s into %s",
_task_id,
_progress_db,
)
except Exception as _enqueue_exc:
logger.warning(
"run_agent_with_input: failed to enqueue task "
"into progress.db (spawn proceeding without "
"pinned task_id): %s",
_enqueue_exc,
)
worker_ids = await colony.spawn(
task=task,
count=1,
input_data={"user_request": task},
input_data=_spawn_input_data,
agent_spec=spawn_spec,
tools=spawn_tools,
tool_executor=spawn_tool_executor,
+19 -3
View File
@@ -87,9 +87,25 @@ export const sessionsApi = {
colonies: (sessionId: string) =>
api.get<{ colonies: string[] }>(`/sessions/${sessionId}/colonies`),
/** Get persisted eventbus log for a session (works for cold sessions — used for full UI replay). */
eventsHistory: (sessionId: string) =>
api.get<{ events: AgentEvent[]; session_id: string }>(`/sessions/${sessionId}/events/history`),
/** Get persisted eventbus log for a session (works for cold sessions — used for full UI replay).
*
* Returns the TAIL of the event log. Default limit 2000 (server
* clamps to [1, 10000]); older events get dropped and
* ``truncated: true`` is set so the UI can show an indicator.
*/
eventsHistory: (sessionId: string, limit?: number) =>
api.get<{
events: AgentEvent[];
session_id: string;
total: number;
returned: number;
truncated: boolean;
limit: number;
}>(
`/sessions/${sessionId}/events/history${
limit ? `?limit=${limit}` : ""
}`,
),
/** Open the session's data folder in the OS file manager. */
revealFolder: (sessionId: string) =>
@@ -31,6 +31,15 @@ export default function AppHeader({ onOpenQueenProfile }: AppHeaderProps) {
const colonyId = colonyMatch[1];
const colony = colonies.find((c) => c.id === colonyId);
title = colony?.name ?? colonyId;
// Show queen profile button when the colony has a linked queen profile
if (colony?.queenProfileId) {
const profile = queenProfiles.find((q) => q.id === colony.queenProfileId);
if (profile) {
queenIdForProfile = profile.id;
queenTitle = profile.title ?? null;
icon = <Crown className="w-4 h-4 text-primary" />;
}
}
} else if (queenMatch) {
const queenId = queenMatch[1];
const profile = queenProfiles.find((q) => q.id === queenId);
+196 -6
View File
@@ -10,6 +10,8 @@ import {
Paperclip,
X,
} from "lucide-react";
import WorkerRunBubble from "@/components/WorkerRunBubble";
import type { WorkerRunGroup } from "@/components/WorkerRunBubble";
export interface ImageContent {
type: "image_url";
@@ -25,6 +27,8 @@ export interface ContextUsageEntry {
import MarkdownContent from "@/components/MarkdownContent";
import QuestionWidget from "@/components/QuestionWidget";
import MultiQuestionWidget from "@/components/MultiQuestionWidget";
import { useColony } from "@/context/ColonyContext";
import { useQueenProfile } from "@/context/QueenProfileContext";
import ParallelSubagentBubble, {
type SubagentGroup,
} from "@/components/ParallelSubagentBubble";
@@ -60,6 +64,12 @@ export interface ChatMessage {
nodeId?: string;
/** Backend execution_id for this message */
executionId?: string;
/** Backend stream_id — the per-worker identity used for grouping
* parallel-spawn workers into their own stacked WorkerRunBubble.
* "queen" for queen messages, "worker" for the single loaded
* worker (run_agent_with_input), or "worker:{uuid}" for each
* parallel worker spawned via run_parallel_workers. */
streamId?: string;
/** True when the message was sent while the queen was still processing */
queued?: boolean;
}
@@ -124,14 +134,14 @@ const TOOL_HEX = [
"#e5a820", // sunflower
];
function toolHex(name: string): string {
/** Deterministically map a tool name to a palette color: a 31-based
 * rolling hash over the string's UTF-16 code units, truncated to a
 * 32-bit int, then used to index TOOL_HEX. */
export function toolHex(name: string): string {
  const acc = name
    .split("")
    .reduce((h, unit) => (h * 31 + unit.charCodeAt(0)) | 0, 0);
  return TOOL_HEX[Math.abs(acc) % TOOL_HEX.length];
}
function ToolActivityRow({ content }: { content: string }) {
export function ToolActivityRow({ content }: { content: string }) {
let tools: { name: string; done: boolean }[] = [];
try {
const parsed = JSON.parse(content);
@@ -336,6 +346,15 @@ function InlineAskUserBubble({
const color = getColor(msg.agent, msg.role);
const thread = msg.thread || activeThread;
const { queenProfiles } = useColony();
const { openQueenProfile } = useQueenProfile();
const queenProfileId = isQueen
? queenProfiles.find((q) => q.name === msg.agent)?.id ?? null
: null;
const handleQueenClick = queenProfileId
? () => openQueenProfile(queenProfileId)
: undefined;
const handleSingle = (answer: string) => {
setState("submitted");
onSend(answer, thread);
@@ -355,12 +374,14 @@ function InlineAskUserBubble({
return (
<div className="flex gap-3">
<div
className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center${handleQueenClick ? " cursor-pointer hover:opacity-80 transition-opacity" : ""}`}
style={{
backgroundColor: `${color}18`,
border: `1.5px solid ${color}35`,
boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
}}
onClick={handleQueenClick}
title={handleQueenClick ? `View ${msg.agent}'s profile` : undefined}
>
{isQueen ? (
<Crown className="w-4 h-4" style={{ color }} />
@@ -373,8 +394,9 @@ function InlineAskUserBubble({
>
<div className="flex items-center gap-2 mb-1">
<span
className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
className={`font-medium ${isQueen ? "text-sm" : "text-xs"}${handleQueenClick ? " cursor-pointer hover:underline" : ""}`}
style={{ color }}
onClick={handleQueenClick}
>
{msg.agent}
</span>
@@ -435,6 +457,13 @@ const MessageBubble = memo(
const isQueen = msg.role === "queen";
const color = getColor(msg.agent, msg.role);
// Resolve queen profile ID so clicking avatar/name opens the profile panel
const { queenProfiles } = useColony();
const { openQueenProfile } = useQueenProfile();
const queenProfileId = isQueen
? queenProfiles.find((q) => q.name === msg.agent)?.id ?? null
: null;
if (msg.type === "run_divider") {
return (
<div className="flex items-center gap-3 py-2 my-1">
@@ -529,15 +558,21 @@ const MessageBubble = memo(
);
}
const handleQueenClick = queenProfileId
? () => openQueenProfile(queenProfileId)
: undefined;
return (
<div className="flex gap-3">
<div
className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center${handleQueenClick ? " cursor-pointer hover:opacity-80 transition-opacity" : ""}`}
style={{
backgroundColor: `${color}18`,
border: `1.5px solid ${color}35`,
boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
}}
onClick={handleQueenClick}
title={handleQueenClick ? `View ${msg.agent}'s profile` : undefined}
>
{isQueen ? (
<Crown className="w-4 h-4" style={{ color }} />
@@ -550,8 +585,9 @@ const MessageBubble = memo(
>
<div className="flex items-center gap-2 mb-1">
<span
className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
className={`font-medium ${isQueen ? "text-sm" : "text-xs"}${handleQueenClick ? " cursor-pointer hover:underline" : ""}`}
style={{ color }}
onClick={handleQueenClick}
>
{msg.agent}
</span>
@@ -665,14 +701,157 @@ export default function ChatPanel({
type RenderItem =
| { kind: "message"; msg: ChatMessage }
| { kind: "parallel"; groupId: string; groups: SubagentGroup[] }
| {
kind: "worker_run";
runId: string;
group: WorkerRunGroup;
/** Optional short label shown next to the "Worker" badge.
* Only set when there are multiple parallel workers in the
* same run span (so users can tell them apart). */
label?: string;
}
| { kind: "day_divider"; key: string; createdAt: number };
/** Derive a short label from a parallel-worker stream id.
* `worker:abcdef12-3456-...` → `abcdef12` (first 8 chars of the
* uuid after the `worker:` prefix). Falls back to the first
* message's nodeId when the streamId isn't the expected shape. */
/** Derive a short label from a parallel-worker stream id.
 * `worker:abcdef12-3456-...` → `abcdef12`; session-style suffixes
 * (`session_YYYYMMDD_HHMMSS_<hex>`) yield the trailing hex run.
 * Non-`worker:` keys fall back to the first message's nodeId, or
 * the raw key itself. */
function deriveWorkerLabel(
  streamKey: string,
  msgs: ChatMessage[],
): string {
  const PREFIX = "worker:";
  if (!streamKey.startsWith(PREFIX)) {
    // Not a parallel-worker stream: use the first tagged message.
    for (const m of msgs) {
      if (m.nodeId) return m.nodeId;
    }
    return streamKey;
  }
  const suffix = streamKey.slice(PREFIX.length);
  // Prefer a trailing `_hexhexhex...` run (session-id shape); else
  // just truncate the suffix.
  const hexTail = suffix.match(/_[0-9a-f]{6,}$/i);
  const label = hexTail ? hexTail[0].slice(1) : suffix;
  return label.slice(0, 8);
}
const renderItems = useMemo<RenderItem[]>(() => {
const items: RenderItem[] = [];
let i = 0;
while (i < threadMessages.length) {
const msg = threadMessages[i];
const isSubagent = msg.nodeId?.includes(":subagent:");
// Worker run grouping: collect consecutive WORKER-role
// messages (and worker tool_status pills) into a collapsible
// card. Queen tool_status pills (``role === "queen"``) are
// deliberately excluded — the queen's own tool calls are part
// of the queen↔user conversation and should render inline as
// ToolActivityRows, not fold into a "Worker" bubble. Without
// this guard, every queen run_command / read_file / etc. shows
// up under a misleading "Worker" label in the DM.
const isWorkerCandidate =
msg.role === "worker" ||
(msg.type === "tool_status" && msg.role !== "queen");
if (
!isSubagent &&
isWorkerCandidate &&
msg.type !== "user" &&
msg.type !== "run_divider"
) {
const workerMsgs: ChatMessage[] = [];
const firstWorkerMsg = msg;
while (i < threadMessages.length) {
const m = threadMessages[i];
// Hard boundary — stop the worker run group
if (m.type === "user" || m.type === "run_divider") break;
// Queen message with real text — boundary (queen is talking
// to the user, not just emitting a tool)
if (m.role === "queen" && m.content?.trim() && !m.type) break;
// Queen tool_status — NOT a worker activity, don't bucket
// it. Break so the grouping stops and the queen pill
// renders inline.
if (m.type === "tool_status" && m.role === "queen") break;
// Subagent message — different group type, stop here
if (m.nodeId?.includes(":subagent:")) break;
// Worker text messages and worker tool_status belong to the run
if (
m.role === "worker" ||
(m.type === "tool_status" && m.role !== "queen")
) {
workerMsgs.push(m);
i++;
continue;
}
// System message or other — include in the worker run
// group to preserve ordering (they'll render inside the
// expanded view)
workerMsgs.push(m);
i++;
}
if (workerMsgs.length > 0) {
// Parallel fan-out detection: if any message in this span
// is tagged with a parallel-worker streamId (``worker:{uuid}``),
// split the span by streamId and emit one ``worker_run``
// per worker — they render as stacked independent
// ``WorkerRunBubble``s. Un-tagged legacy messages and the
// single-worker ``streamId="worker"`` case fall through to
// the existing single-bubble behavior.
const hasParallel = workerMsgs.some(
(m) => !!m.streamId && /^worker:./.test(m.streamId),
);
if (hasParallel) {
const buckets = new Map<
string,
{ messages: ChatMessage[]; firstAt: number }
>();
// Messages with no streamId (system notes, orphans from
// old restore) attach to the most-recent keyed message's
// bucket so chronology is preserved.
let currentKey: string | null = null;
for (const m of workerMsgs) {
const key =
m.streamId && m.streamId.length > 0
? m.streamId
: currentKey;
if (!key) continue;
if (m.streamId && m.streamId.length > 0) currentKey = m.streamId;
let bucket = buckets.get(key);
if (!bucket) {
bucket = { messages: [], firstAt: m.createdAt ?? 0 };
buckets.set(key, bucket);
}
bucket.messages.push(m);
bucket.firstAt = Math.min(
bucket.firstAt,
m.createdAt ?? Number.POSITIVE_INFINITY,
);
}
const sorted = Array.from(buckets.entries()).sort(
([, a], [, b]) => a.firstAt - b.firstAt,
);
for (const [streamKey, { messages: bucketMsgs }] of sorted) {
items.push({
kind: "worker_run",
runId: `wrun-${firstWorkerMsg.id}-${streamKey}`,
group: { messages: bucketMsgs },
label: deriveWorkerLabel(streamKey, bucketMsgs),
});
}
} else {
items.push({
kind: "worker_run",
runId: `wrun-${firstWorkerMsg.id}`,
group: { messages: workerMsgs },
});
}
}
continue;
}
if (!isSubagent) {
items.push({ kind: "message", msg });
i++;
@@ -872,6 +1051,17 @@ export default function ChatPanel({
</div>
);
}
if (item.kind === "worker_run") {
return (
<div key={item.runId}>
<WorkerRunBubble
runId={item.runId}
group={item.group}
label={item.label}
/>
</div>
);
}
const msg = item.msg;
// Detect misformatted ask_user payloads emitted as plain text and
// substitute the nicer widget-based bubble. Only inspect regular
@@ -1,4 +1,4 @@
import { useState, useEffect } from "react";
import { useState, useEffect, useCallback, useRef } from "react";
import { NavLink, useLocation, useNavigate } from "react-router-dom";
import {
X,
@@ -46,8 +46,49 @@ export default function QueenProfilePanel({
const name = profile?.name ?? summary?.name ?? "Queen";
const title = profile?.title ?? summary?.title ?? "";
// ── Resizable width ──────────────────────────────────────────────────
const MIN_WIDTH = 280;
const MAX_WIDTH = 600;
const [width, setWidth] = useState(340);
const dragging = useRef(false);
const startX = useRef(0);
const startWidth = useRef(0);
const onDragStart = useCallback((e: React.MouseEvent) => {
  e.preventDefault();
  dragging.current = true;
  startX.current = e.clientX;
  startWidth.current = width;
  const handleMove = (ev: MouseEvent) => {
    if (!dragging.current) return;
    // Panel sits on the right edge, so moving the pointer LEFT
    // (smaller clientX) should grow it: width follows startX - x.
    const grownBy = startX.current - ev.clientX;
    const next = startWidth.current + grownBy;
    setWidth(Math.max(MIN_WIDTH, Math.min(MAX_WIDTH, next)));
  };
  const handleUp = () => {
    dragging.current = false;
    document.removeEventListener("mousemove", handleMove);
    document.removeEventListener("mouseup", handleUp);
    document.body.style.cursor = "";
    document.body.style.userSelect = "";
  };
  // Listen on document (not the handle) so fast drags that leave the
  // 1px hit area keep tracking; cursor/select are forced globally for
  // the duration of the drag.
  // NOTE(review): if the component unmounts mid-drag these document
  // listeners are never removed — confirm whether that can happen here.
  document.addEventListener("mousemove", handleMove);
  document.addEventListener("mouseup", handleUp);
  document.body.style.cursor = "col-resize";
  document.body.style.userSelect = "none";
}, [width]);
return (
<aside className="w-[340px] flex-shrink-0 border-l border-border/60 bg-card overflow-y-auto">
<aside
className="flex-shrink-0 border-l border-border/60 bg-card overflow-y-auto relative"
style={{ width }}
>
{/* Drag handle */}
<div
onMouseDown={onDragStart}
className="absolute top-0 left-0 w-1 h-full cursor-col-resize hover:bg-primary/30 active:bg-primary/50 transition-colors z-10"
/>
{/* Header */}
<div className="flex items-center justify-between px-5 py-3.5 border-b border-border/60">
<div className="flex items-center gap-2 text-sm font-semibold text-foreground">
+42 -2
View File
@@ -1,4 +1,4 @@
import { useState } from "react";
import { useState, useCallback, useRef } from "react";
import { useNavigate } from "react-router-dom";
import {
ChevronLeft,
@@ -22,6 +22,38 @@ export default function Sidebar() {
const [coloniesExpanded, setColoniesExpanded] = useState(true);
const [queensExpanded, setQueensExpanded] = useState(true);
// ── Resizable width ──────────────────────────────────────────────────
const MIN_WIDTH = 180;
const MAX_WIDTH = 400;
const [width, setWidth] = useState(240);
const dragging = useRef(false);
const startX = useRef(0);
const startWidth = useRef(0);
const onDragStart = useCallback((e: React.MouseEvent) => {
  e.preventDefault();
  dragging.current = true;
  startX.current = e.clientX;
  startWidth.current = width;
  const handleMove = (ev: MouseEvent) => {
    if (!dragging.current) return;
    // Sidebar is on the left with the handle on its right edge, so
    // moving the pointer RIGHT (larger clientX) grows it.
    const grownBy = ev.clientX - startX.current;
    const next = startWidth.current + grownBy;
    setWidth(Math.max(MIN_WIDTH, Math.min(MAX_WIDTH, next)));
  };
  const handleUp = () => {
    dragging.current = false;
    document.removeEventListener("mousemove", handleMove);
    document.removeEventListener("mouseup", handleUp);
    document.body.style.cursor = "";
    document.body.style.userSelect = "";
  };
  // Document-level listeners so the drag keeps tracking when the
  // pointer leaves the narrow handle; global cursor/select override
  // for the duration of the drag.
  document.addEventListener("mousemove", handleMove);
  document.addEventListener("mouseup", handleUp);
  document.body.style.cursor = "col-resize";
  document.body.style.userSelect = "none";
}, [width]);
if (sidebarCollapsed) {
return (
<aside className="w-[52px] flex-shrink-0 flex flex-col bg-sidebar-bg border-r border-sidebar-border h-full">
@@ -50,7 +82,15 @@ export default function Sidebar() {
}
return (
<aside className="w-[240px] flex-shrink-0 flex flex-col bg-sidebar-bg border-r border-sidebar-border h-full">
<aside
className="flex-shrink-0 flex flex-col bg-sidebar-bg border-r border-sidebar-border h-full relative"
style={{ width }}
>
{/* Drag handle on right edge */}
<div
onMouseDown={onDragStart}
className="absolute top-0 right-0 w-1 h-full cursor-col-resize hover:bg-primary/30 active:bg-primary/50 transition-colors z-10"
/>
{/* Header */}
<div className="h-12 flex items-center justify-between px-4 border-b border-border/60">
<button
@@ -0,0 +1,297 @@
import { memo, useState, useRef, useEffect } from "react";
import { ChevronDown, ChevronUp, Cpu } from "lucide-react";
import type { ChatMessage } from "@/components/ChatPanel";
import { ToolActivityRow } from "@/components/ChatPanel";
import MarkdownContent from "@/components/MarkdownContent";
const workerColor = "hsl(220,60%,55%)";
export interface WorkerRunGroup {
messages: ChatMessage[];
}
interface WorkerRunBubbleProps {
runId: string;
group: WorkerRunGroup;
/** Short identifier shown next to the "Worker" badge. Populated
* only when the parent grouping has multiple parallel workers
* in the same run span, so N stacked bubbles can be told apart
* at a glance. Omitted for single-worker runs. */
label?: string;
}
/**
 * Parse a tool_status JSON blob into a list of tool entries.
 *
 * Always returns a real array: malformed JSON, a non-object payload,
 * a missing `tools` key, or a `tools` value of the wrong type (a
 * corrupted message could carry e.g. a string) all yield `[]`.
 * Callers iterate the result directly, so the declared array return
 * type must hold unconditionally.
 */
function parseToolStatus(content: string): { name: string; done: boolean }[] {
  try {
    const parsed = JSON.parse(content);
    // Guard the shape — `parsed.tools || []` would leak a truthy
    // non-array value straight to callers.
    return Array.isArray(parsed?.tools) ? parsed.tools : [];
  } catch {
    return [];
  }
}
/**
 * Strip markdown formatting so the collapsed preview is a single
 * readable line instead of a scatter of code pills.
 *
 * MarkdownContent renders every backtick-wrapped fragment as its own
 * visually-boxed inline-code pill; in worker prose those fragments
 * are coordinates, UUIDs, selectors, tool names — the collapsed
 * preview ends up looking like confetti. Here the markdown is reduced
 * to plain prose, squashed onto one line, and truncated with an
 * ellipsis.
 */
function stripMarkdownToPreview(s: string, maxLen = 160): string {
  // Ordered (pattern, replacement) passes. Fenced blocks go first so
  // their contents never reach the inline-code pass; whitespace is
  // collapsed last, after line-anchored header/list passes have run.
  const passes: [RegExp, string][] = [
    [/```[\s\S]*?```/g, " [code] "], // fenced code blocks
    [/`([^`]+)`/g, "$1"], // inline code — keep the text, drop backticks
    [/\*\*([^*]+)\*\*/g, "$1"], // bold
    [/\*([^*]+)\*/g, "$1"], // italic
    [/~~([^~]+)~~/g, "$1"], // strikethrough
    [/\[([^\]]+)\]\([^)]+\)/g, "$1"], // links -> link text
    [/^#{1,6}\s+/gm, ""], // ATX headers
    [/^[>\-*+]\s+/gm, ""], // blockquote/list markers
    [/\s+/g, " "], // collapse whitespace
  ];
  let text = s;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  text = text.trim();
  if (text.length > maxLen) {
    text = text.slice(0, maxLen - 1).trimEnd() + "\u2026";
  }
  return text;
}
/**
 * Collapsible card that groups all worker messages from a single run
 * (the span between the queen's `run_agent_with_input` call and the
 * worker's final `set_output`/`escalate`/idle).
 *
 * Collapsed (default): header bar with tool count + latest text snippet.
 * Expanded: scrollable list of every message and tool status in order.
 *
 * Re-render contract: tool_status messages are UPSERTED by id — the
 * same pill message is re-emitted with updated content as tools
 * finish — so the memo comparator must inspect every message, not
 * just the tail (see the comparator below).
 */
const WorkerRunBubble = memo(
  function WorkerRunBubble({ group, label }: WorkerRunBubbleProps) {
    const [expanded, setExpanded] = useState(false);
    const bodyRef = useRef<HTMLDivElement>(null);
    // Separate text messages from tool status
    const textMsgs = group.messages.filter(
      (m) => m.type !== "tool_status" && m.content?.trim()
    );
    const toolStatusMsgs = group.messages.filter(
      (m) => m.type === "tool_status"
    );
    // Count total tool calls from tool_status messages
    const allTools: { name: string; done: boolean }[] = [];
    for (const m of toolStatusMsgs) {
      for (const t of parseToolStatus(m.content)) {
        allTools.push(t);
      }
    }
    const toolCount = allTools.length;
    const doneCount = allTools.filter((t) => t.done).length;
    const isFinished = toolCount > 0 && doneCount === toolCount;
    // Latest text from the worker (the last non-empty text message)
    const latestText = textMsgs.length > 0
      ? textMsgs[textMsgs.length - 1].content
      : "";
    // Status label. We prefer concrete states over the vague
    // "starting" fallback — if the worker has emitted any text or
    // any tool, it's past the startup phase.
    const statusLabel = isFinished
      ? "done"
      : toolCount > 0
      ? "running"
      : textMsgs.length > 0
      ? "active"
      : "starting";
    // Unique tool names for the summary (deduplicated, ordered by first appearance)
    const uniqueToolNames: string[] = [];
    const seen = new Set<string>();
    for (const t of allTools) {
      if (!seen.has(t.name)) {
        seen.add(t.name);
        uniqueToolNames.push(t.name);
      }
    }
    // Auto-scroll body when expanded
    useEffect(() => {
      if (expanded && bodyRef.current) {
        bodyRef.current.scrollTop = bodyRef.current.scrollHeight;
      }
    }, [expanded, group.messages.length]);
    return (
      <div className="flex gap-3">
        {/* Left icon */}
        <div
          className="flex-shrink-0 w-7 h-7 rounded-xl flex items-center justify-center mt-1"
          style={{
            backgroundColor: `${workerColor}18`,
            border: `1.5px solid ${workerColor}35`,
          }}
        >
          <Cpu className="w-3.5 h-3.5" style={{ color: workerColor }} />
        </div>
        <div className="flex-1 min-w-0 max-w-[90%]">
          {/* Clickable header */}
          <button
            onClick={() => setExpanded((v) => !v)}
            className="w-full flex items-center gap-2 mb-1 text-left cursor-pointer group"
          >
            <span className="font-medium text-xs" style={{ color: workerColor }}>
              Worker
            </span>
            {label && (
              <span className="text-[10px] font-mono text-muted-foreground/80 tabular-nums">
                {label}
              </span>
            )}
            <span
              className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
                isFinished
                  ? "bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400"
                  : "bg-muted text-muted-foreground"
              }`}
            >
              {statusLabel}
            </span>
            {toolCount > 0 && (
              <span className="text-[10px] text-muted-foreground tabular-nums">
                {doneCount}/{toolCount} tools
              </span>
            )}
            <span className="ml-auto text-muted-foreground/60 group-hover:text-muted-foreground transition-colors p-0.5 rounded">
              {expanded ? (
                <ChevronUp className="w-3.5 h-3.5" />
              ) : (
                <ChevronDown className="w-3.5 h-3.5" />
              )}
            </span>
          </button>
          {/* Card body — use Tailwind theme tokens so dark mode
              gets a proper dark background instead of a glaring
              near-white hardcoded hsl. Finished runs get a subtle
              green tint that also respects theme. */}
          <div
            className={`rounded-2xl rounded-tl-md overflow-hidden border ${
              isFinished
                ? "border-green-300/50 bg-green-50/50 dark:border-green-900/40 dark:bg-green-950/20"
                : "border-border bg-muted/60"
            }`}
          >
            {/* Collapsed: single-line plain-text preview of the
                latest worker text, OR a tool-name chain when the
                worker hasn't emitted any prose yet. MarkdownContent
                is intentionally NOT used here — its inline-code
                rendering turns every backtick-wrapped fragment into
                a floating pill, which wrecks the preview. */}
            {!expanded && (
              <div className="px-4 py-2.5 text-sm text-muted-foreground">
                {latestText ? (
                  <div className="truncate">
                    {stripMarkdownToPreview(latestText)}
                  </div>
                ) : uniqueToolNames.length > 0 ? (
                  <span className="text-xs font-mono truncate block">
                    {uniqueToolNames.slice(0, 5).join(" \u2192 ")}
                    {uniqueToolNames.length > 5 &&
                      ` + ${uniqueToolNames.length - 5} more`}
                  </span>
                ) : (
                  <span className="text-xs text-muted-foreground/60 italic">
                    {"waiting for first action\u2026"}
                  </span>
                )}
              </div>
            )}
            {/* Expanded: chronological stream with tool bursts
                coalesced into a single ToolActivityRow each.
                Consecutive tool_status messages (no text between)
                collapse to the LATEST snapshot — each snapshot is
                cumulative within its turn, so the latest one tells
                the whole story for that burst. Text messages break
                the burst and render as markdown. */}
            {expanded && (
              <div
                ref={bodyRef}
                className="max-h-[400px] overflow-y-auto px-4 py-3 space-y-3"
              >
                {(() => {
                  type RenderRow =
                    | { kind: "tools"; content: string; key: string }
                    | { kind: "text"; msg: ChatMessage; key: string };
                  const rows: RenderRow[] = [];
                  let pendingTool: { content: string; id: string } | null = null;
                  const flushTool = () => {
                    if (pendingTool) {
                      rows.push({
                        kind: "tools",
                        content: pendingTool.content,
                        key: `tools-${pendingTool.id}`,
                      });
                      pendingTool = null;
                    }
                  };
                  for (let i = 0; i < group.messages.length; i++) {
                    const m = group.messages[i];
                    if (m.type === "tool_status") {
                      // Overwrite — latest snapshot in the burst wins
                      pendingTool = {
                        content: m.content,
                        id: m.id || `ts-${i}`,
                      };
                      continue;
                    }
                    if (m.content?.trim()) {
                      flushTool();
                      rows.push({
                        kind: "text",
                        msg: m,
                        key: m.id || `txt-${i}`,
                      });
                    }
                  }
                  flushTool();
                  return rows.map((row) => {
                    if (row.kind === "tools") {
                      // ToolActivityRow groups by tool name (×N), shows
                      // running pills (spinner) before done pills (check),
                      // and uses the per-tool color hash that matches
                      // the rest of the chat.
                      return (
                        <div key={row.key} className="-ml-10">
                          <ToolActivityRow content={row.content} />
                        </div>
                      );
                    }
                    return (
                      <div
                        key={row.key}
                        className="text-sm leading-relaxed"
                      >
                        <MarkdownContent content={row.msg.content} />
                      </div>
                    );
                  });
                })()}
              </div>
            )}
          </div>
        </div>
      </div>
    );
  },
  // Re-render guard. Compare the content of EVERY message, not just
  // the last one: tool_status pills are upserted in place (same id,
  // new content) as tools complete, so an interior message can change
  // while the array length and the trailing text message stay
  // identical. Comparing only the tail skipped those updates and left
  // pills frozen at "running".
  (prev, next) =>
    prev.runId === next.runId &&
    prev.label === next.label &&
    prev.group.messages.length === next.group.messages.length &&
    prev.group.messages.every(
      (m, i) => m.content === next.group.messages[i].content,
    )
);
export default WorkerRunBubble;
@@ -0,0 +1,31 @@
import { createContext, useContext, useCallback, type ReactNode } from "react";
// Imperative bridge: lets any descendant (e.g. a queen chip deep in the
// tree) open the queen-profile side panel without prop-drilling the
// open handler through every layer.
interface QueenProfileContextValue {
  openQueenProfile: (queenId: string) => void;
}
// Default is null (not a no-op) so calling the hook outside the
// provider is detectable — useQueenProfile throws in that case.
const QueenProfileContext = createContext<QueenProfileContextValue | null>(null);
/**
 * Provides the queen-profile opener to the subtree. `onOpen` receives
 * the queenId of the profile to show; the provider wraps it in a
 * callback whose identity is stable for a given `onOpen`, so memoized
 * consumers keyed on the function don't churn on unrelated re-renders.
 */
export function QueenProfileProvider({
  onOpen,
  children,
}: {
  onOpen: (queenId: string) => void;
  children: ReactNode;
}) {
  const openQueenProfile = useCallback(
    (queenId: string) => onOpen(queenId),
    [onOpen],
  );
  const contextValue = { openQueenProfile };
  return (
    <QueenProfileContext.Provider value={contextValue}>
      {children}
    </QueenProfileContext.Provider>
  );
}
/**
 * Accessor for the queen-profile opener. Must be called under a
 * QueenProfileProvider; throws otherwise so a missing provider fails
 * loudly instead of silently dropping clicks.
 */
export function useQueenProfile() {
  const value = useContext(QueenProfileContext);
  if (value === null) {
    throw new Error("useQueenProfile must be used within QueenProfileProvider");
  }
  return value;
}
+27 -19
View File
@@ -1,10 +1,11 @@
import { useEffect, useState } from "react";
import { useEffect, useState, useCallback } from "react";
import { Outlet, useLocation } from "react-router-dom";
import Sidebar from "@/components/Sidebar";
import AppHeader from "@/components/AppHeader";
import QueenProfilePanel from "@/components/QueenProfilePanel";
import { ColonyProvider, useColony } from "@/context/ColonyContext";
import { HeaderActionsProvider } from "@/context/HeaderActionsContext";
import { QueenProfileProvider } from "@/context/QueenProfileContext";
export default function AppLayout() {
return (
@@ -27,26 +28,33 @@ function AppLayoutInner() {
setOpenQueenId(null);
}, [location.pathname]);
const handleOpenQueenProfile = useCallback(
(queenId: string) => setOpenQueenId((prev) => (prev === queenId ? null : queenId)),
[],
);
return (
<div className="flex h-screen bg-background overflow-hidden">
<Sidebar />
<div className="flex-1 min-w-0 flex flex-col">
<AppHeader onOpenQueenProfile={setOpenQueenId} />
<div className="flex-1 min-h-0 flex">
<main className="flex-1 min-w-0 flex flex-col">
<Outlet />
</main>
{openQueenId && (
<QueenProfilePanel
queenId={openQueenId}
colonies={colonies.filter(
(c) => c.queenProfileId === openQueenId,
)}
onClose={() => setOpenQueenId(null)}
/>
)}
<QueenProfileProvider onOpen={handleOpenQueenProfile}>
<div className="flex h-screen bg-background overflow-hidden">
<Sidebar />
<div className="flex-1 min-w-0 flex flex-col">
<AppHeader onOpenQueenProfile={handleOpenQueenProfile} />
<div className="flex-1 min-h-0 flex">
<main className="flex-1 min-w-0 flex flex-col">
<Outlet />
</main>
{openQueenId && (
<QueenProfilePanel
queenId={openQueenId}
colonies={colonies.filter(
(c) => c.queenProfileId === openQueenId,
)}
onClose={() => setOpenQueenId(null)}
/>
)}
</div>
</div>
</div>
</div>
</QueenProfileProvider>
);
}
+187
View File
@@ -119,6 +119,7 @@ export function sseEventToChatMessage(
createdAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
streamId: event.stream_id || undefined,
};
}
@@ -138,6 +139,7 @@ export function sseEventToChatMessage(
type: "user",
thread,
createdAt,
streamId: event.stream_id || undefined,
};
}
@@ -158,6 +160,7 @@ export function sseEventToChatMessage(
createdAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
streamId: event.stream_id || undefined,
};
}
@@ -172,6 +175,7 @@ export function sseEventToChatMessage(
type: "system",
thread,
createdAt,
streamId: event.stream_id || undefined,
};
}
@@ -186,6 +190,7 @@ export function sseEventToChatMessage(
type: "system",
thread,
createdAt,
streamId: event.stream_id || undefined,
};
}
@@ -194,6 +199,188 @@ export function sseEventToChatMessage(
}
}
// ---------------------------------------------------------------------------
// Stateful event replay — produces tool_status pills + regular messages
// ---------------------------------------------------------------------------
/**
* State maintained while replaying an event stream. Tracks per-stream turn
* counters, the set of active tool calls (so tool_status pill content
* reflects "tool A done, tool B running" correctly), and a tool_use_id →
* pill_msg_id map so deferred `tool_call_completed` events can find the
* pill they belong to after the turn counter moves on.
*/
export interface ReplayState {
turnCounters: Record<string, number>;
activeToolCalls: Record<
string,
{ name: string; done: boolean; streamId: string }
>;
toolUseToPill: Record<string, { msgId: string; name: string }>;
}
/** Fresh replay state: no turns seen, no in-flight tools, no pill map. */
export function newReplayState(): ReplayState {
  const state: ReplayState = {
    turnCounters: {},
    activeToolCalls: {},
    toolUseToPill: {},
  };
  return state;
}
/**
* Process a single event and emit zero or more ChatMessage upserts.
*
* Why this exists: `sseEventToChatMessage` is stateless — one event in, at
* most one message out. But the chat's tool_status pill is a SYNTHESIZED
* message: each tool_call_started adds to an accumulating pill, and each
* tool_call_completed flips one of its tools from running to done. Live
* SSE handlers in colony-chat and queen-dm already do this synthesis
* against React refs. Cold-restore from events.jsonl used to skip
* tool_call_* events entirely, so refreshed sessions looked completely
* different from live ones — no tool activity visible, just prose.
*
* This function centralizes the synthesis so cold-restore and live paths
* can use the exact same state machine. The caller treats the returned
* messages as upserts (by id) — a later event in the same replay may
* emit the same pill id with updated content, which should REPLACE the
* earlier row in the caller's message list.
*/
export function replayEvent(
  state: ReplayState,
  event: AgentEvent,
  thread: string,
  agentDisplayName: string | undefined,
): ChatMessage[] {
  // stream_id distinguishes the queen stream ("queen") from worker
  // streams; it also keys the per-stream turn counters below.
  const streamId = event.stream_id;
  const isQueen = streamId === "queen";
  const role: "queen" | "worker" = isQueen ? "queen" : "worker";
  const turnKey = streamId;
  // Turn counter as it stood BEFORE this event — pill ids for
  // tool_call_started use this pre-increment value.
  const currentTurn = state.turnCounters[turnKey] ?? 0;
  const eventCreatedAt = event.timestamp
    ? new Date(event.timestamp).getTime()
    : Date.now();
  const out: ChatMessage[] = [];
  // Update state machine BEFORE the generic converter runs so the
  // regular message emitted for this event sees the post-update
  // counter (matches live handler ordering at colony-chat.tsx:525).
  switch (event.type) {
    case "execution_started":
      state.turnCounters[turnKey] = currentTurn + 1;
      // New execution for a worker resets its active tools, mirroring
      // the live handler's setAgentState at colony-chat.tsx:566.
      if (!isQueen) {
        const keepActive: typeof state.activeToolCalls = {};
        for (const [k, v] of Object.entries(state.activeToolCalls)) {
          if (v.streamId !== streamId) keepActive[k] = v;
        }
        state.activeToolCalls = keepActive;
      }
      break;
    case "llm_turn_complete":
      state.turnCounters[turnKey] = currentTurn + 1;
      break;
    case "tool_call_started": {
      if (!event.node_id) break;
      const toolName = (event.data?.tool_name as string) || "unknown";
      const toolUseId = (event.data?.tool_use_id as string) || "";
      state.activeToolCalls[toolUseId] = {
        name: toolName,
        done: false,
        streamId,
      };
      // One pill per (stream, execution, turn): every tool started in
      // the same turn re-emits the same message id, so the UI shows a
      // single accumulating pill.
      const pillId = `tool-pill-${streamId}-${event.execution_id || "exec"}-${currentTurn}`;
      if (toolUseId) {
        state.toolUseToPill[toolUseId] = { msgId: pillId, name: toolName };
      }
      // Pill content reflects only THIS stream's tools — other
      // parallel workers keep their own pills.
      const tools = Object.values(state.activeToolCalls)
        .filter((t) => t.streamId === streamId)
        .map((t) => ({ name: t.name, done: t.done }));
      const allDone = tools.length > 0 && tools.every((t) => t.done);
      out.push({
        id: pillId,
        agent: agentDisplayName || event.node_id || "Agent",
        agentColor: "",
        content: JSON.stringify({ tools, allDone }),
        timestamp: "",
        type: "tool_status",
        role,
        thread,
        createdAt: eventCreatedAt,
        nodeId: event.node_id || undefined,
        executionId: event.execution_id || undefined,
        streamId: streamId || undefined,
      });
      break;
    }
    case "tool_call_completed": {
      if (!event.node_id) break;
      const toolUseId = (event.data?.tool_use_id as string) || "";
      const tracked = state.toolUseToPill[toolUseId];
      if (toolUseId) delete state.toolUseToPill[toolUseId];
      if (toolUseId && state.activeToolCalls[toolUseId]) {
        state.activeToolCalls[toolUseId].done = true;
      }
      // Completion for a tool we never saw start (e.g. truncated
      // history) has no pill to update — drop it silently.
      if (!tracked) break;
      const tools = Object.values(state.activeToolCalls)
        .filter((t) => t.streamId === streamId)
        .map((t) => ({ name: t.name, done: t.done }));
      const allDone = tools.length > 0 && tools.every((t) => t.done);
      // Re-emit the SAME pill id with updated content. Caller upserts
      // by id, so this replaces the row from tool_call_started.
      out.push({
        id: tracked.msgId,
        agent: agentDisplayName || event.node_id || "Agent",
        agentColor: "",
        content: JSON.stringify({ tools, allDone }),
        timestamp: "",
        type: "tool_status",
        role,
        thread,
        createdAt: eventCreatedAt,
        nodeId: event.node_id || undefined,
        executionId: event.execution_id || undefined,
        streamId: streamId || undefined,
      });
      break;
    }
  }
  // Regular stateless conversion (prose, user input, system notes).
  const msg = sseEventToChatMessage(
    event,
    thread,
    agentDisplayName,
    state.turnCounters[turnKey] ?? 0,
  );
  if (msg) {
    if (isQueen) msg.role = "queen";
    out.push(msg);
  }
  return out;
}
/**
 * Replay an entire event array and return a deduplicated, chronologically
 * sorted ChatMessage list. Used by cold-restore paths so refreshed
 * sessions match the live stream exactly.
 */
export function replayEventsToMessages(
  events: AgentEvent[],
  thread: string,
  agentDisplayName: string | undefined,
): ChatMessage[] {
  const replay = newReplayState();
  // Last write wins per message id — a pill re-emitted with updated
  // content replaces its earlier snapshot.
  const upserted = new Map<string, ChatMessage>();
  for (const event of events) {
    const emitted = replayEvent(replay, event, thread, agentDisplayName);
    for (const message of emitted) {
      upserted.set(message.id, message);
    }
  }
  const ordered = [...upserted.values()];
  ordered.sort((a, b) => (a.createdAt ?? 0) - (b.createdAt ?? 0));
  return ordered;
}
type QueenPhase = "planning" | "building" | "staging" | "running" | "independent";
const VALID_PHASES = new Set<string>(["planning", "building", "staging", "running", "independent"]);
+54 -14
View File
@@ -13,7 +13,11 @@ import { executionApi } from "@/api/execution";
import { sessionsApi } from "@/api/sessions";
import { useMultiSSE } from "@/hooks/use-sse";
import type { LiveSession, AgentEvent } from "@/api/types";
import { sseEventToChatMessage, formatAgentDisplayName } from "@/lib/chat-helpers";
import {
sseEventToChatMessage,
formatAgentDisplayName,
replayEventsToMessages,
} from "@/lib/chat-helpers";
import { cronToLabel } from "@/lib/graphUtils";
import { ApiError } from "@/api/client";
import { useColony } from "@/context/ColonyContext";
@@ -41,6 +45,8 @@ function truncate(s: string, max: number): string {
type SessionRestoreResult = {
messages: ChatMessage[];
restoredPhase: "planning" | "building" | "staging" | "running" | "independent" | null;
truncated: boolean;
droppedCount: number;
};
async function restoreSessionMessages(
@@ -49,34 +55,67 @@ async function restoreSessionMessages(
agentDisplayName: string,
): Promise<SessionRestoreResult> {
try {
const { events } = await sessionsApi.eventsHistory(sessionId);
const { events, truncated, total, returned } =
await sessionsApi.eventsHistory(sessionId);
if (events.length > 0) {
const messages: ChatMessage[] = [];
// Walk events twice:
// 1. Extract the trailing queen phase (unchanged logic).
// 2. Run the full state-machine replay so tool_status pills
// are synthesized just like the live SSE handler does.
// Without (2), refreshed sessions showed zero tool activity
// because tool_call_started/completed events are ignored by
// the stateless converter.
let runningPhase: ChatMessage["phase"] = undefined;
for (const evt of events) {
const p =
evt.type === "queen_phase_changed"
? (evt.data?.phase as string)
: evt.type === "node_loop_iteration"
? (evt.data?.phase as string | undefined)
: undefined;
? (evt.data?.phase as string | undefined)
: undefined;
if (p && ["planning", "building", "staging", "running"].includes(p)) {
runningPhase = p as ChatMessage["phase"];
}
const msg = sseEventToChatMessage(evt, thread, agentDisplayName);
if (!msg) continue;
if (evt.stream_id === "queen") {
msg.role = "queen";
msg.phase = runningPhase;
}
messages.push(msg);
}
return { messages, restoredPhase: runningPhase ?? null };
const messages = replayEventsToMessages(events, thread, agentDisplayName);
// Stamp the latest phase on every queen message so the UI's
// phase-badge rendering matches what the live path would have
// displayed at the time of the refresh.
if (runningPhase) {
for (const m of messages) {
if (m.role === "queen") m.phase = runningPhase;
}
}
// Prepend a run_divider banner when the server truncated older
// events so the user knows how many are hidden.
const droppedCount = Math.max(0, total - returned);
if (truncated && droppedCount > 0) {
const firstTs = events[0]?.timestamp;
const bannerCreatedAt = firstTs ? new Date(firstTs).getTime() - 1 : 0;
messages.unshift({
id: `restore-truncated-${sessionId}`,
agent: "System",
agentColor: "",
type: "run_divider",
content: `${droppedCount.toLocaleString()} older event${droppedCount === 1 ? "" : "s"} not shown (showing last ${returned.toLocaleString()})`,
timestamp: firstTs ?? new Date().toISOString(),
thread,
createdAt: bannerCreatedAt,
});
}
return {
messages,
restoredPhase: runningPhase ?? null,
truncated,
droppedCount,
};
}
} catch {
// Event log not available
}
return { messages: [], restoredPhase: null };
return { messages: [], restoredPhase: null, truncated: false, droppedCount: 0 };
}
// ── Agent backend state ──────────────────────────────────────────────────────
@@ -816,6 +855,7 @@ export default function ColonyChat() {
createdAt: eventCreatedAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
streamId: sid || undefined,
});
return { ...prev, isStreaming: false, activeToolCalls: newActive };
});
+30 -9
View File
@@ -11,7 +11,10 @@ import { sessionsApi } from "@/api/sessions";
import { queensApi } from "@/api/queens";
import { useMultiSSE } from "@/hooks/use-sse";
import type { AgentEvent, HistorySession } from "@/api/types";
import { sseEventToChatMessage } from "@/lib/chat-helpers";
import {
sseEventToChatMessage,
replayEventsToMessages,
} from "@/lib/chat-helpers";
import { useColony } from "@/context/ColonyContext";
import { useHeaderActions } from "@/context/HeaderActionsContext";
import { getQueenForAgent, slugToColonyId } from "@/lib/colony-registry";
@@ -90,17 +93,34 @@ export default function QueenDM() {
const restoreMessages = useCallback(
async (sid: string, cancelled: () => boolean) => {
try {
const { events } = await sessionsApi.eventsHistory(sid);
const { events, truncated, total, returned } =
await sessionsApi.eventsHistory(sid);
if (cancelled()) return;
const restored: ChatMessage[] = [];
for (const evt of events) {
const msg = sseEventToChatMessage(evt, "queen-dm", queenName);
if (!msg) continue;
if (evt.stream_id === "queen") msg.role = "queen";
restored.push(msg);
// Use the stateful replay so tool_status pills are synthesized
// the same way the live SSE handler does — without this the
// refreshed queen DM shows zero tool activity.
const restored = replayEventsToMessages(events, "queen-dm", queenName);
// Show a banner if the server truncated older events.
const droppedCount = Math.max(0, total - returned);
if (truncated && droppedCount > 0) {
const firstTs = events[0]?.timestamp;
const bannerCreatedAt = firstTs
? new Date(firstTs).getTime() - 1
: 0;
restored.unshift({
id: `restore-truncated-${sid}`,
agent: "System",
agentColor: "",
type: "run_divider",
content: `${droppedCount.toLocaleString()} older event${droppedCount === 1 ? "" : "s"} not shown (showing last ${returned.toLocaleString()})`,
timestamp: firstTs ?? new Date().toISOString(),
thread: "queen-dm",
createdAt: bannerCreatedAt,
});
}
if (restored.length > 0 && !cancelled()) {
restored.sort((a, b) => (a.createdAt ?? 0) - (b.createdAt ?? 0));
setMessages(restored);
// Only clear typing if the history contains a completed execution;
// during bootstrap the queen is still processing.
@@ -601,6 +621,7 @@ export default function QueenDM() {
createdAt: eventCreatedAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
streamId: sid || undefined,
};
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === msgId);
+17 -2
View File
@@ -72,13 +72,28 @@ def patched_fork(monkeypatch):
"""Stub out fork_session_into_colony so we don't need a real queen."""
calls: list[dict] = []
async def _stub_fork(*, session: Any, colony_name: str, task: str) -> dict:
calls.append({"session": session, "colony_name": colony_name, "task": task})
async def _stub_fork(
    *,
    session: Any,
    colony_name: str,
    task: str,
    tasks: list[dict] | None = None,
) -> dict:
    """Record the fork arguments and return a canned fork result."""
    # Tests inspect ``calls`` to assert on what was forked.
    recorded = {
        "session": session,
        "colony_name": colony_name,
        "task": task,
        "tasks": tasks,
    }
    calls.append(recorded)
    return {
        "colony_path": f"/tmp/fake_colonies/{colony_name}",
        "colony_name": colony_name,
        "queen_session_id": "session_fake_fork_id",
        "is_new": True,
        "db_path": f"/tmp/fake_colonies/{colony_name}/data/progress.db",
        "task_ids": [],
    }
monkeypatch.setattr(
+28 -42
View File
@@ -17,10 +17,10 @@ _DEFAULT_SKILLS_DIR = Path(__file__).resolve().parent.parent / "framework" / "sk
class TestDefaultSkillFiles:
"""Verify all 7 built-in SKILL.md files parse correctly."""
"""Verify all built-in SKILL.md files parse correctly."""
def test_all_seven_skills_exist(self):
assert len(SKILL_REGISTRY) == 7
def test_all_skills_exist(self):
assert len(SKILL_REGISTRY) == 6
@pytest.mark.parametrize("skill_name,dir_name", list(SKILL_REGISTRY.items()))
def test_skill_parses(self, skill_name, dir_name):
@@ -35,7 +35,13 @@ class TestDefaultSkillFiles:
assert parsed.source_scope == "framework"
def test_combined_token_budget(self):
"""All default skill bodies combined should be under 3000 tokens (~12000 chars)."""
"""All default skill bodies combined should stay within the protocols budget.
Ceiling is 5000 tokens (~20000 chars): the prompt-injection path
appends every registered skill body to the system prompt, so
uncontrolled growth would balloon every LLM call. 5000 gives
headroom over today's ~3500 while still catching obvious bloat.
"""
total_chars = 0
for dir_name in SKILL_REGISTRY.values():
path = _DEFAULT_SKILLS_DIR / dir_name / "SKILL.md"
@@ -44,9 +50,9 @@ class TestDefaultSkillFiles:
total_chars += len(parsed.body)
approx_tokens = total_chars // 4
assert approx_tokens < 3000, (
assert approx_tokens < 5000, (
f"Combined default skill bodies are ~{approx_tokens} tokens "
f"({total_chars} chars), exceeding the 3000 token budget"
f"({total_chars} chars), exceeding the 5000 token budget"
)
def test_data_buffer_keys_all_prefixed(self):
@@ -60,7 +66,7 @@ class TestDefaultSkillManager:
manager = DefaultSkillManager()
manager.load()
assert len(manager.active_skill_names) == 7
assert len(manager.active_skill_names) == len(SKILL_REGISTRY)
for name in SKILL_REGISTRY:
assert name in manager.active_skill_names
@@ -97,7 +103,7 @@ class TestDefaultSkillManager:
manager.load()
assert "hive.quality-monitor" not in manager.active_skill_names
assert len(manager.active_skill_names) == 6
assert len(manager.active_skill_names) == len(SKILL_REGISTRY) - 1
def test_disable_all_via_convention(self):
config = SkillsConfig.from_agent_vars(default_skills={"_all": {"enabled": False}})
@@ -136,7 +142,7 @@ class TestSkillsConfig:
def test_explicit_disable(self):
config = SkillsConfig(default_skills={"hive.note-taking": DefaultSkillConfig(enabled=False)})
assert config.is_default_enabled("hive.note-taking") is False
assert config.is_default_enabled("hive.batch-ledger") is True
assert config.is_default_enabled("hive.quality-monitor") is True
def test_all_disabled_flag(self):
config = SkillsConfig(all_defaults_disabled=True)
@@ -166,11 +172,11 @@ class TestSkillsConfig:
def test_get_default_overrides(self):
config = SkillsConfig.from_agent_vars(
default_skills={
"hive.batch-ledger": {"enabled": True, "checkpoint_every_n": 10},
"hive.quality-monitor": {"enabled": True, "assessment_interval": 10},
}
)
overrides = config.get_default_overrides("hive.batch-ledger")
assert overrides == {"checkpoint_every_n": 10}
overrides = config.get_default_overrides("hive.quality-monitor")
assert overrides == {"assessment_interval": 10}
def test_get_default_overrides_empty(self):
config = SkillsConfig()
@@ -244,40 +250,20 @@ class TestConfigOverrideSubstitution:
assert "{{" not in cleaned
class TestBatchAutoDetection:
"""DS-12: is_batch_scenario() and batch_init_nudge property."""
class TestBatchDeprecatedNoOps:
"""batch-ledger skill was removed; is_batch_scenario() and batch_init_nudge
are deprecated no-ops that return False / None unconditionally. They are
kept in-tree to avoid touching every orchestrator/execution_manager call
site that still reads the nudge through the config plumbing."""
def test_detects_list_of(self):
assert is_batch_scenario("process a list of 100 leads") is True
def test_is_batch_scenario_always_false(self):
assert is_batch_scenario("process a list of 100 leads") is False
assert is_batch_scenario("for each record, send an email") is False
assert is_batch_scenario("write a summary") is False
def test_detects_collection_of(self):
assert is_batch_scenario("a collection of invoices") is True
def test_detects_items(self):
assert is_batch_scenario("go through all items in the spreadsheet") is True
def test_detects_for_each(self):
assert is_batch_scenario("for each record, send an email") is True
def test_no_match_single_task(self):
assert is_batch_scenario("write a summary of the quarterly report") is False
def test_batch_nudge_active_by_default(self):
def test_batch_init_nudge_always_none(self):
manager = DefaultSkillManager()
manager.load()
assert manager.batch_init_nudge is not None
assert "_batch_ledger" in manager.batch_init_nudge
def test_batch_nudge_none_when_skill_disabled(self):
config = SkillsConfig.from_agent_vars(default_skills={"hive.batch-ledger": {"enabled": False}})
manager = DefaultSkillManager(config)
manager.load()
assert manager.batch_init_nudge is None
def test_batch_nudge_none_when_auto_detect_disabled(self):
config = SkillsConfig.from_agent_vars(default_skills={"hive.batch-ledger": {"auto_detect_batch": False}})
manager = DefaultSkillManager(config)
manager.load()
assert manager.batch_init_nudge is None
+590
View File
@@ -0,0 +1,590 @@
"""Tests for framework.host.progress_db — per-colony task queue."""
from __future__ import annotations
import sqlite3
import threading
import time
from pathlib import Path
import pytest
from framework.host.progress_db import (
SCHEMA_VERSION,
ensure_all_colony_dbs,
ensure_progress_db,
enqueue_task,
reclaim_stale,
seed_tasks,
)
# ----------------------------------------------------------------------
# Schema / init
# ----------------------------------------------------------------------
def test_ensure_progress_db_fresh(tmp_path: Path) -> None:
    """A fresh colony dir gets data/progress.db in WAL mode with the full schema."""
    db_path = ensure_progress_db(tmp_path / "c")
    assert db_path.exists()
    assert db_path.name == "progress.db"
    assert db_path.parent.name == "data"
    conn = sqlite3.connect(str(db_path))
    try:
        journal_mode = conn.execute("PRAGMA journal_mode").fetchone()[0]
        assert journal_mode.lower() == "wal"
        assert conn.execute("PRAGMA user_version").fetchone()[0] == SCHEMA_VERSION
        table_names = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")}
        assert {"tasks", "steps", "sop_checklist", "colony_meta"} <= table_names
        index_names = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='index'")}
        # Named indexes we declared
        for expected_index in (
            "idx_tasks_claimable",
            "idx_steps_task_seq",
            "idx_sop_required_open",
            "idx_tasks_status",
        ):
            assert expected_index in index_names
    finally:
        conn.close()
def test_ensure_progress_db_idempotent(tmp_path: Path) -> None:
    """Calling ensure_progress_db twice yields the same path and schema version."""
    colony_dir = tmp_path / "c"
    first = ensure_progress_db(colony_dir)
    second = ensure_progress_db(colony_dir)
    assert first == second
    conn = sqlite3.connect(str(first))
    try:
        stored_version = conn.execute("PRAGMA user_version").fetchone()[0]
        assert stored_version == SCHEMA_VERSION
    finally:
        conn.close()
def test_ensure_all_colony_dbs_backfill(tmp_path: Path) -> None:
    """Backfill initializes a DB for every colony directory and skips plain files."""
    colonies_root = tmp_path / "colonies"
    for colony_name in ("alpha", "beta"):
        (colonies_root / colony_name).mkdir(parents=True)
    (colonies_root / "gamma_not_dir").touch()  # should be ignored
    initialized = ensure_all_colony_dbs(colonies_root)
    colony_names = {db.parent.parent.name for db in initialized}
    assert colony_names == {"alpha", "beta"}
    assert all(db.exists() for db in initialized)
def test_ensure_all_colony_dbs_missing_root(tmp_path: Path) -> None:
    """A nonexistent colonies root returns an empty list rather than raising."""
    assert ensure_all_colony_dbs(tmp_path / "nonexistent") == []
# ----------------------------------------------------------------------
# Seeding / enqueue
# ----------------------------------------------------------------------
def test_seed_tasks_basic(tmp_path: Path) -> None:
    """Seeding two tasks persists rows plus their child steps and SOP items.

    Also verifies stored defaults: status starts 'pending', source is
    'queen_create', and the payload dict is JSON-serialized into the row.
    """
    db = ensure_progress_db(tmp_path / "c")
    ids = seed_tasks(
        db,
        [
            {
                "goal": "task one",
                "priority": 5,
                "payload": {"url": "https://example.com"},
                "steps": [
                    {"title": "open page"},
                    {"title": "extract data", "detail": "selector .content"},
                ],
                "sop_items": [
                    {"key": "captcha_handled", "description": "Verify no CAPTCHA blocks"},
                    {"key": "soft_hint", "description": "optional", "required": False},
                ],
            },
            {"goal": "task two"},
        ],
    )
    assert len(ids) == 2
    con = sqlite3.connect(str(db))
    try:
        # ORDER BY goal makes "task one" land deterministically at index 0.
        rows = list(con.execute("SELECT id, goal, priority, status, source, payload FROM tasks ORDER BY goal"))
        assert len(rows) == 2
        assert rows[0][1] == "task one"
        assert rows[0][2] == 5
        assert rows[0][3] == "pending"
        assert rows[0][4] == "queen_create"
        # Payload is stored as a JSON string, so the key survives round-trip.
        assert '"url"' in rows[0][5]
        step_count = con.execute(
            "SELECT count(*) FROM steps WHERE task_id=?", (ids[0],)
        ).fetchone()[0]
        assert step_count == 2
        # ``required`` is stored as 1 by default and 0 when explicitly False.
        sop_rows = list(con.execute(
            "SELECT key, required FROM sop_checklist WHERE task_id=? ORDER BY key", (ids[0],)
        ))
        assert sop_rows == [("captcha_handled", 1), ("soft_hint", 0)]
    finally:
        con.close()
def test_seed_tasks_rejects_missing_goal(tmp_path: Path) -> None:
    """A task dict without a goal key is rejected with ValueError."""
    db_path = ensure_progress_db(tmp_path / "c")
    bad_rows = [{"priority": 1}]
    with pytest.raises(ValueError):
        seed_tasks(db_path, bad_rows)
def test_seed_tasks_empty_is_noop(tmp_path: Path) -> None:
    """Seeding an empty batch returns no ids."""
    db_path = ensure_progress_db(tmp_path / "c")
    result = seed_tasks(db_path, [])
    assert result == []
def test_seed_tasks_rollback_on_partial_failure(tmp_path: Path) -> None:
    """A bad row mid-batch must roll back the whole transaction."""
    db_path = ensure_progress_db(tmp_path / "c")
    batch = [
        {"goal": "good one"},
        {"priority": 1},  # missing goal -> boom
        {"goal": "never inserted"},
    ]
    with pytest.raises(ValueError):
        seed_tasks(db_path, batch)
    conn = sqlite3.connect(str(db_path))
    try:
        # Neither the good row before nor the one after may survive.
        total = conn.execute("SELECT count(*) FROM tasks").fetchone()[0]
        assert total == 0
    finally:
        conn.close()
def test_enqueue_task(tmp_path: Path) -> None:
    """enqueue_task appends one task (with steps/SOP) tagged source=enqueue_tool."""
    db_path = ensure_progress_db(tmp_path / "c")
    task_id = enqueue_task(
        db_path,
        "appended",
        steps=[{"title": "s1"}],
        sop_items=[{"key": "k", "description": "d"}],
        priority=3,
    )
    assert task_id
    conn = sqlite3.connect(str(db_path))
    try:
        task_row = conn.execute(
            "SELECT goal, priority, source FROM tasks WHERE id=?", (task_id,)
        ).fetchone()
        assert task_row == ("appended", 3, "enqueue_tool")
        step_total = conn.execute(
            "SELECT count(*) FROM steps WHERE task_id=?", (task_id,)
        ).fetchone()[0]
        assert step_total == 1
    finally:
        conn.close()
def test_enqueue_task_custom_source(tmp_path: Path) -> None:
    """enqueue_task must accept a custom source value (e.g. run_agent_with_input).

    Phase 2 wiring adds source values: create_colony_auto,
    run_agent_with_input, run_parallel_workers. Verify the source
    column stores them verbatim.
    """
    db_path = ensure_progress_db(tmp_path / "c")
    task_id = enqueue_task(db_path, "chat task", source="run_agent_with_input")
    conn = sqlite3.connect(str(db_path))
    try:
        stored = conn.execute("SELECT goal, source FROM tasks WHERE id=?", (task_id,)).fetchone()
        assert stored == ("chat task", "run_agent_with_input")
    finally:
        conn.close()
def test_claim_by_assigned_id(tmp_path: Path) -> None:
    """Worker protocol: claim a specific row by id (not the generic next-pending).

    The Phase 2 fix threads ``task_id`` into ``input_data`` when the
    queen pre-assigns a row. The worker must be able to claim THAT
    row atomically with an ``UPDATE ... WHERE id=? AND status='pending'``
    pattern, and a second claim on the same id must return 0 rows.
    """
    db = ensure_progress_db(tmp_path / "c")
    [tid] = seed_tasks(db, [{"goal": "pinned task"}])
    # isolation_level=None puts the connection in autocommit mode, so
    # each UPDATE below is its own implicit transaction.
    con = sqlite3.connect(str(db), isolation_level=None, timeout=5.0)
    try:
        cur = con.execute(
            """
            UPDATE tasks SET status='claimed', worker_id=?,
                   claim_token=lower(hex(randomblob(8))),
                   claimed_at=datetime('now'),
                   updated_at=datetime('now')
            WHERE id=? AND status='pending'
            RETURNING id, goal
            """,
            ("w1", tid),
        )
        row = cur.fetchone()
        assert row == (tid, "pinned task"), f"expected one claim, got {row}"
        # Second attempt on the same id must affect zero rows.
        cur2 = con.execute(
            """
            UPDATE tasks SET status='claimed', worker_id=?,
                   claim_token=lower(hex(randomblob(8))),
                   claimed_at=datetime('now')
            WHERE id=? AND status='pending'
            RETURNING id
            """,
            ("w2", tid),
        )
        assert cur2.fetchone() is None, "second claim should affect zero rows"
        # Ensure worker_id on the row is still the first claimant.
        owner = con.execute(
            "SELECT worker_id, status FROM tasks WHERE id=?", (tid,)
        ).fetchone()
        assert owner == ("w1", "claimed")
    finally:
        con.close()
def test_claim_by_id_does_not_steal_unrelated_rows(tmp_path: Path) -> None:
    """Claim-by-id must only touch the named row, not siblings."""
    db_path = ensure_progress_db(tmp_path / "c")
    task_ids = seed_tasks(db_path, [{"goal": "a"}, {"goal": "b"}, {"goal": "c"}])
    conn = sqlite3.connect(str(db_path), isolation_level=None)
    try:
        # Claim only the middle row.
        conn.execute(
            "UPDATE tasks SET status='claimed', worker_id='w1', "
            "claimed_at=datetime('now') WHERE id=? AND status='pending'",
            (task_ids[1],),
        )
        status_by_goal = dict(conn.execute("SELECT goal, status FROM tasks").fetchall())
        assert status_by_goal == {"a": "pending", "b": "claimed", "c": "pending"}
    finally:
        conn.close()
def test_seed_tasks_bulk_10k(tmp_path: Path) -> None:
    """10k rows in one transaction should finish under a second on local disk."""
    db_path = ensure_progress_db(tmp_path / "c")
    batch = [{"goal": f"task {i}", "seq": i} for i in range(10_000)]
    t0 = time.perf_counter()
    inserted_ids = seed_tasks(db_path, batch)
    elapsed = time.perf_counter() - t0
    assert len(inserted_ids) == 10_000
    # Generous ceiling — on CI with slow disk we've seen ~300ms.
    assert elapsed < 3.0, f"bulk seed too slow: {elapsed:.2f}s"
# ----------------------------------------------------------------------
# Atomic claim under concurrency
# ----------------------------------------------------------------------
_CLAIM_SQL = """
BEGIN IMMEDIATE;
UPDATE tasks
SET
status = 'claimed',
worker_id = ?,
claim_token = lower(hex(randomblob(8))),
claimed_at = datetime('now'),
updated_at = datetime('now')
WHERE id = (
SELECT id FROM tasks
WHERE status = 'pending'
ORDER BY priority DESC, seq, created_at
LIMIT 1
);
"""
def _claim_one(db_path: Path, worker_id: str) -> str | None:
"""Atomic single-shot claim using RETURNING (SQLite 3.35+).
The skill teaches agents the BEGIN IMMEDIATE + subquery UPDATE
pattern; for an in-process test helper we use RETURNING so the
claimed row id is returned from the same statement (no racing
follow-up SELECT). Functionally equivalent: both approaches rely
on the atomic subquery-UPDATE.
"""
con = sqlite3.connect(str(db_path), isolation_level=None, timeout=10.0)
con.execute("PRAGMA busy_timeout = 10000")
try:
cur = con.execute(
"""
UPDATE tasks
SET status = 'claimed',
worker_id = ?,
claim_token = lower(hex(randomblob(8))),
claimed_at = datetime('now'),
updated_at = datetime('now')
WHERE id = (
SELECT id FROM tasks
WHERE status = 'pending'
ORDER BY priority DESC, seq, created_at
LIMIT 1
)
RETURNING id
""",
(worker_id,),
)
row = cur.fetchone()
return row[0] if row else None
finally:
con.close()
def test_claim_atomicity_under_concurrency(tmp_path: Path) -> None:
    """20 threads racing to drain 100 tasks — each task claimed exactly once."""
    db = ensure_progress_db(tmp_path / "c")
    seed_tasks(db, [{"goal": f"task {i}", "seq": i} for i in range(100)])
    # (worker_id, task_id) pairs; guarded by claims_lock since many
    # threads append concurrently.
    claims: list[tuple[str, str]] = []
    claims_lock = threading.Lock()

    def worker(worker_id: str) -> None:
        # Drain loop: keep claiming until no pending rows remain.
        while True:
            tid = _claim_one(db, worker_id)
            if tid is None:
                return
            with claims_lock:
                claims.append((worker_id, tid))

    threads = [threading.Thread(target=worker, args=(f"w{i}",)) for i in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join(timeout=30)
    task_ids = [tid for _, tid in claims]
    # Exactly-once semantics: 100 claims total, all distinct ids.
    assert len(task_ids) == 100, f"expected 100 claims, got {len(task_ids)}"
    assert len(set(task_ids)) == 100, "duplicate claims detected"
    con = sqlite3.connect(str(db))
    try:
        remaining = con.execute(
            "SELECT count(*) FROM tasks WHERE status='pending'"
        ).fetchone()[0]
        assert remaining == 0
        claimed = con.execute(
            "SELECT count(*) FROM tasks WHERE status='claimed'"
        ).fetchone()[0]
        assert claimed == 100
    finally:
        con.close()
# ----------------------------------------------------------------------
# Stale-claim reclaimer
# ----------------------------------------------------------------------
def test_reclaim_stale_returns_to_pending(tmp_path: Path) -> None:
    """A claim older than the window returns to pending with retry_count bumped."""
    db_path = ensure_progress_db(tmp_path / "c")
    [task_id] = seed_tasks(db_path, [{"goal": "stuck"}])
    # Simulate a claim made 20 minutes ago.
    conn = sqlite3.connect(str(db_path), isolation_level=None)
    try:
        conn.execute(
            "UPDATE tasks SET status='claimed', worker_id='w1', "
            "claimed_at=datetime('now', '-20 minutes') WHERE id=?",
            (task_id,),
        )
    finally:
        conn.close()
    assert reclaim_stale(db_path, stale_after_minutes=15) == 1
    conn = sqlite3.connect(str(db_path))
    try:
        state = conn.execute(
            "SELECT status, worker_id, retry_count FROM tasks WHERE id=?", (task_id,)
        ).fetchone()
        assert state == ("pending", None, 1)
    finally:
        conn.close()
def test_reclaim_stale_fails_after_max_retries(tmp_path: Path) -> None:
    """A stale task already at max_retries is marked failed with an error note."""
    db_path = ensure_progress_db(tmp_path / "c")
    [task_id] = seed_tasks(db_path, [{"goal": "doomed", "max_retries": 2}])
    conn = sqlite3.connect(str(db_path), isolation_level=None)
    try:
        # Already at the retry ceiling, and the claim is long stale.
        conn.execute(
            "UPDATE tasks SET status='claimed', worker_id='w1', retry_count=2, "
            "claimed_at=datetime('now', '-20 minutes') WHERE id=?",
            (task_id,),
        )
    finally:
        conn.close()
    reclaim_stale(db_path, stale_after_minutes=15)
    conn = sqlite3.connect(str(db_path))
    try:
        status, last_error = conn.execute(
            "SELECT status, last_error FROM tasks WHERE id=?", (task_id,)
        ).fetchone()
        assert status == "failed"
        assert last_error is not None and "max_retries" in last_error
    finally:
        conn.close()
def test_reclaim_stale_ignores_fresh_claims(tmp_path: Path) -> None:
    """A claim made just now is inside the window and must not be reclaimed."""
    db_path = ensure_progress_db(tmp_path / "c")
    [task_id] = seed_tasks(db_path, [{"goal": "working"}])
    conn = sqlite3.connect(str(db_path), isolation_level=None)
    try:
        conn.execute(
            "UPDATE tasks SET status='claimed', worker_id='w1', "
            "claimed_at=datetime('now') WHERE id=?",
            (task_id,),
        )
    finally:
        conn.close()
    assert reclaim_stale(db_path, stale_after_minutes=15) == 0
# ----------------------------------------------------------------------
# Foreign key cascade
# ----------------------------------------------------------------------
# ----------------------------------------------------------------------
# Worker config patching for pre-existing colonies
# ----------------------------------------------------------------------
def _write_worker_cfg(path: Path, *, with_input_data: dict | None = None) -> None:
"""Write a minimal worker.json that matches the shape ensure_progress_db patches."""
import json as _json
cfg = {
"name": "worker",
"system_prompt": "You are a worker.",
"goal": {"description": "do stuff", "success_criteria": [], "constraints": []},
"tools": [],
}
if with_input_data is not None:
cfg["input_data"] = with_input_data
path.write_text(_json.dumps(cfg, indent=2))
def test_ensure_progress_db_patches_existing_worker_json(tmp_path: Path) -> None:
    """Pre-existing worker.json without input_data gets db_path injected."""
    import json as _json

    colony = tmp_path / "legacy_colony"
    colony.mkdir()
    cfg_path = colony / "worker.json"
    _write_worker_cfg(cfg_path)
    # Before: no input_data
    assert "input_data" not in _json.loads(cfg_path.read_text())
    db = ensure_progress_db(colony)
    patched = _json.loads(cfg_path.read_text())
    assert patched["input_data"]["db_path"] == str(db)
    assert patched["input_data"]["colony_id"] == "legacy_colony"
    # Other fields untouched
    assert patched["system_prompt"] == "You are a worker."
    assert patched["goal"]["description"] == "do stuff"
def test_ensure_progress_db_patch_is_idempotent(tmp_path: Path) -> None:
    """Second call must not rewrite the file (mtime unchanged)."""
    import time as _time

    colony = tmp_path / "idem"
    colony.mkdir()
    cfg_path = colony / "worker.json"
    _write_worker_cfg(cfg_path)
    ensure_progress_db(colony)
    first_mtime = cfg_path.stat().st_mtime
    _time.sleep(0.02)  # ensure any rewrite would bump mtime
    ensure_progress_db(colony)
    second_mtime = cfg_path.stat().st_mtime
    assert second_mtime == first_mtime, "second ensure_progress_db must not rewrite worker.json"
def test_ensure_progress_db_preserves_existing_input_data_keys(tmp_path: Path) -> None:
    """Pre-existing input_data keys (other than db_path/colony_id) are preserved."""
    import json as _json

    colony = tmp_path / "preserved"
    colony.mkdir()
    _write_worker_cfg(
        colony / "worker.json",
        with_input_data={"custom_key": "hello", "db_path": "/stale/path.db"},
    )
    db = ensure_progress_db(colony)
    patched = _json.loads((colony / "worker.json").read_text())
    # Custom key survives; the stale db_path is overwritten.
    assert patched["input_data"]["custom_key"] == "hello"
    assert patched["input_data"]["db_path"] == str(db)
    assert patched["input_data"]["colony_id"] == "preserved"
def test_ensure_progress_db_skips_metadata_and_triggers(tmp_path: Path) -> None:
    """metadata.json and triggers.json are not worker configs — must not be touched."""
    import json as _json

    colony = tmp_path / "guarded"
    colony.mkdir()
    (colony / "metadata.json").write_text(_json.dumps({"colony_name": "guarded"}))
    (colony / "triggers.json").write_text(_json.dumps([{"id": "t1"}]))
    _write_worker_cfg(colony / "worker.json")
    ensure_progress_db(colony)
    # Only worker.json gains input_data; the other JSON files stay verbatim.
    assert "input_data" not in _json.loads((colony / "metadata.json").read_text())
    assert _json.loads((colony / "triggers.json").read_text()) == [{"id": "t1"}]
    assert "input_data" in _json.loads((colony / "worker.json").read_text())
def test_task_delete_cascades_to_steps_and_sop(tmp_path: Path) -> None:
    """Deleting a task (with FKs enabled) removes its steps and SOP rows too."""
    db_path = ensure_progress_db(tmp_path / "c")
    [task_id] = seed_tasks(
        db_path,
        [
            {
                "goal": "cascade test",
                "steps": [{"title": "a"}, {"title": "b"}],
                "sop_items": [{"key": "k", "description": "d"}],
            }
        ],
    )
    conn = sqlite3.connect(str(db_path), isolation_level=None)
    try:
        # Cascades only fire when foreign keys are enabled on the connection.
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("DELETE FROM tasks WHERE id=?", (task_id,))
        for child_table in ("steps", "sop_checklist"):
            orphan_count = conn.execute(
                f"SELECT count(*) FROM {child_table} WHERE task_id=?", (task_id,)
            ).fetchone()[0]
            assert orphan_count == 0
    finally:
        conn.close()
+1 -1
View File
@@ -141,7 +141,7 @@ class TestSkillDiscovery:
framework_skills = [s for s in skills if s.source_scope == "framework"]
names = {s.name for s in framework_skills}
assert "hive.note-taking" in names
assert "hive.batch-ledger" in names
assert "hive.colony-progress-tracker" in names
def test_max_depth_limit(self, tmp_path):
# Create a skill nested beyond max_depth
+42
View File
@@ -271,6 +271,48 @@ else
exit 1
fi
# Check for sqlite3 CLI (required for colony progress tracking)
# Missing sqlite3 is non-fatal: we try a best-effort auto-install and
# otherwise print manual instructions, then continue the setup.
echo -n "  Checking for sqlite3... "
if command -v sqlite3 &> /dev/null; then
    echo -e "${GREEN}ok${NC}"
else
    echo -e "${YELLOW}not found${NC}"
    # Attempt auto-install on common package managers
    # (first match wins: apt-get, brew, apk, dnf, pacman).
    SQLITE_INSTALLED=false
    if command -v apt-get &> /dev/null; then
        echo -n "  Installing sqlite3 via apt... "
        if sudo apt-get install -y sqlite3 > /dev/null 2>&1; then
            SQLITE_INSTALLED=true
        fi
    elif command -v brew &> /dev/null; then
        echo -n "  Installing sqlite3 via brew... "
        if brew install sqlite > /dev/null 2>&1; then
            SQLITE_INSTALLED=true
        fi
    elif command -v apk &> /dev/null; then
        echo -n "  Installing sqlite3 via apk... "
        # NOTE(review): apk typically runs as root in containers, so no sudo here.
        if apk add sqlite > /dev/null 2>&1; then
            SQLITE_INSTALLED=true
        fi
    elif command -v dnf &> /dev/null; then
        echo -n "  Installing sqlite3 via dnf... "
        if sudo dnf install -y sqlite > /dev/null 2>&1; then
            SQLITE_INSTALLED=true
        fi
    elif command -v pacman &> /dev/null; then
        echo -n "  Installing sqlite3 via pacman... "
        if sudo pacman -S --noconfirm sqlite > /dev/null 2>&1; then
            SQLITE_INSTALLED=true
        fi
    fi
    if [ "$SQLITE_INSTALLED" = true ]; then
        echo -e "${GREEN}ok${NC}"
    else
        # Soft warning only — the rest of setup proceeds without sqlite3.
        echo -e "${YELLOW} ⚠ Could not install sqlite3 automatically${NC}"
        echo -e "${DIM}   Install manually: apt install sqlite3 / brew install sqlite / apk add sqlite${NC}"
    fi
fi
# Check for Chrome/Edge (required for GCU browser tools)
echo -n " Checking for Chrome/Edge browser... "
# Check common browser locations
-132
View File
@@ -1,132 +0,0 @@
---
name: linkedin-connection-greeter
description: Automates accepting LinkedIn connections and sending a welcome message about the HoneyComb prediction market. Handles shadow DOM and Lexical editors.
---
# LinkedIn Connection Greeter
This skill outlines the exact flow to accept connection requests and send a specific welcome message without triggering spam filters.
## 1. Load Ledger
Before starting, read `data/linkedin_contacts.json`. If it doesn't exist, initialize with `{"contacts": []}`. You will use this to skip people you've already messaged.
## 2. Scan Pending Connections
Navigate to `https://www.linkedin.com/mynetwork/invitation-manager/received/`. Wait until load + sleep 4s.
Strip unload handlers:
`browser_evaluate("(function(){window.onbeforeunload=null;})()")`
Extract cards using this specific snippet (handles changing classes and follow invites):
```javascript
(function(){
const btns = Array.from(document.querySelectorAll('button')).filter(b => b.textContent.includes('Accept'));
let results = [];
for (let b of btns) {
let card = b.closest('[role="listitem"]');
if (!card) continue;
let text = card.textContent.toLowerCase();
if (text.includes('invited you to follow') || text.includes('invited you to subscribe')) continue;
let nameEls = Array.from(card.querySelectorAll('a[href*="/in/"]'));
let nameEl = nameEls.find(el => el.textContent.trim().length > 0);
let r = b.getBoundingClientRect();
results.push({
first_name: nameEl ? nameEl.textContent.trim().split(/\s+/)[0] : 'there',
profile_url: nameEl ? nameEl.href : '',
cx: r.x + r.width/2,
cy: r.y + r.height/2
});
}
return results;
})();
```
## 3. Process Each Card (Max 10 per run)
For each card, check if `profile_url` is already in the ledger. If not:
1. `browser_click_coordinate(cx, cy)` to click the specific Accept button.
2. `sleep(2)`
3. `browser_navigate(profile_url, wait_until="load")`
4. `sleep(4)`
5. `browser_evaluate("(function(){window.onbeforeunload=null; window.addEventListener('beforeunload', e => e.stopImmediatePropagation(), true);})()")`
## 4. Message the User
Click Message Button on their profile:
```javascript
(function(){
const links = Array.from(document.querySelectorAll('a[href*="/messaging/compose/"]'));
for (const a of links){
if (!a.href.includes('NON_SELF_PROFILE_VIEW') || a.href.includes('body=')) continue;
const r = a.getBoundingClientRect();
if (r.width === 0 || r.x > 700) continue;
return {cx: r.x + r.width / 2, cy: r.y + r.height / 2};
}
return null;
})();
```
Click that coordinate, then `sleep(2.5)`.
Find Textarea (it is hidden inside shadow DOM):
```javascript
(function(){
const vh = window.innerHeight, vw = window.innerWidth;
const candidates = [];
function walk(root){
const els = root.querySelectorAll ? root.querySelectorAll('div.msg-form__contenteditable') : [];
for (const el of els){
const r = el.getBoundingClientRect();
if (r.width > 0 && r.height > 0 && r.y >= 0 && r.y + r.height <= vh && r.x >= 0 && r.x + r.width <= vw) {
candidates.push({cx: r.x + r.width/2, cy: r.y + r.height/2, area: r.width * r.height});
}
}
const all = root.querySelectorAll ? root.querySelectorAll('*') : [];
for (const host of all){ if (host.shadowRoot) walk(host.shadowRoot); }
}
walk(document);
candidates.sort((a, b) => b.area - a.area);
return candidates.length ? candidates[0] : null;
})();
```
Click that coordinate, `sleep(1)`.
Type the message:
Construct the message: `Hey {first_name}, thanks for the connection invite! I'm currently building a prediction market for jobs: https://honeycomb.open-hive.com/. If you could check it out and share some feedback, I'd really appreciate it.`
Use `browser_type_focused` — it dispatches CDP `Input.insertText` to the already-focused composer (document.activeElement), which works through shadow DOM without JSON-escaping issues:
```
browser_type_focused(text=message_text)
sleep(1.0)
```
Find Send button (also inside shadow DOM):
```javascript
(function(){
const vh = window.innerHeight;
function walk(root){
const btns = root.querySelectorAll ? root.querySelectorAll('button') : [];
for (const b of btns){
const cls = (b.className || '').toString();
if (!cls.includes('send-button') && b.textContent.trim() !== 'Send') continue;
const r = b.getBoundingClientRect();
if (r.width <= 0 || r.y + r.height > vh) continue;
return { cx: r.x + r.width/2, cy: r.y + r.height/2, disabled: b.disabled || b.getAttribute('aria-disabled') === 'true' };
}
const all = root.querySelectorAll ? root.querySelectorAll('*') : [];
for (const host of all){ if (host.shadowRoot) { const got = walk(host.shadowRoot); if (got) return got; } }
return null;
}
return walk(document);
})();
```
Click send coordinate, `sleep(2)`.
## 5. Update Ledger
Append the user to `data/linkedin_contacts.json`.
```json
{
"profile_url": "...",
"name": "...",
"action": "connection_accepted+message_sent",
"timestamp": "2026-..."
}
```
`sleep(5)` before moving to the next card to mimic human pacing.
+121 -7
View File
@@ -82,10 +82,29 @@ def _find_project_root() -> str:
return os.path.dirname(os.path.abspath(__file__))
def _resolve_path(path: str) -> str:
"""Resolve path relative to PROJECT_ROOT. Raises ValueError if outside.
# When ``--write-root`` is passed on the CLI, ``WRITE_ROOT`` diverges
# from ``PROJECT_ROOT``: reads stay permissive (so the queen can
# reference framework skills, docs, and the hive repo), but writes
# are confined to the write root plus the ``~/.hive/`` escape hatch.
# Without this split, the coder-tools sandbox IS the hive git
# checkout — every queen-authored skill/ledger/script lands there as
# untracked debris, which was the 2026-04-15 incident
# (``~/aden/hive/x-rapid-reply/`` and siblings).
WRITE_ROOT: str = ""
Also allows access to ~/.hive/ directory for agent session data files.
def _resolve_read_path(path: str) -> str:
"""Resolve path for READ operations.
Allowlist (in order):
1. Paths under ``~/.hive/`` agent session data, colonies, skills.
2. Paths under ``PROJECT_ROOT`` hive repo, for reading framework
defaults, docs, examples, etc.
3. Relative paths joined against ``PROJECT_ROOT`` (read-side
default; writes use ``WRITE_ROOT`` instead).
Raises ``ValueError`` when the resolved path falls outside all
allowed roots.
"""
# Normalize slashes for cross-platform (e.g. exports/hi_agent from LLM)
path = path.replace("/", os.sep)
@@ -153,6 +172,88 @@ def _resolve_path(path: str) -> str:
return resolved
def _resolve_write_path(path: str) -> str:
"""Resolve path for WRITE operations.
Stricter than the read resolver: only allows writes under:
1. ``WRITE_ROOT`` the agent workspace (default: ``~/.hive/workspace/``
when ``--write-root`` is passed).
2. ``~/.hive/`` agent session data.
Writes to the hive repo (``PROJECT_ROOT``) are REJECTED to keep
the git checkout clean of queen-authored debris. Relative paths
resolve against ``WRITE_ROOT``, not ``PROJECT_ROOT``.
When ``WRITE_ROOT`` equals ``PROJECT_ROOT`` (no split configured),
this function is semantically identical to ``_resolve_read_path``.
"""
# Normalize slashes + expand ~
path = path.replace("/", os.sep)
if path.startswith("~"):
path = os.path.expanduser(path)
hive_dir = os.path.expanduser("~/.hive")
if os.path.isabs(path):
resolved = os.path.abspath(path)
# Always allow writes under ~/.hive/
try:
if os.path.commonpath([resolved, hive_dir]) == hive_dir:
return resolved
except ValueError:
pass
# Writes are ALSO allowed under WRITE_ROOT (the agent workspace).
try:
if os.path.commonpath([resolved, WRITE_ROOT]) == WRITE_ROOT:
return resolved
except ValueError:
pass
# If WRITE_ROOT == PROJECT_ROOT (legacy behavior: no split),
# fall through to the read-side resolver so existing callers
# keep working unchanged.
if WRITE_ROOT == PROJECT_ROOT:
return _resolve_read_path(path)
# Split configured AND the path isn't under WRITE_ROOT or
# ~/.hive/. Reject — this is the whole point of the split.
raise ValueError(
f"Access denied: writes must be under '{WRITE_ROOT}' or "
f"'{hive_dir}'. Path '{path}' is outside both "
"(use an absolute path under one of those roots, or a "
"relative path which will resolve under the write root)."
)
else:
# Relative path: resolve against WRITE_ROOT, not PROJECT_ROOT.
resolved = os.path.abspath(os.path.join(WRITE_ROOT, path))
# Double-check the resolved absolute path is inside WRITE_ROOT or
# ~/.hive/ (covers edge cases like "../../etc/passwd" that escape).
try:
wr_common = os.path.commonpath([resolved, WRITE_ROOT])
except ValueError:
wr_common = ""
try:
hv_common = os.path.commonpath([resolved, hive_dir])
except ValueError:
hv_common = ""
if wr_common != WRITE_ROOT and hv_common != hive_dir:
raise ValueError(
f"Access denied: resolved write path '{resolved}' escaped the "
f"allowed roots ('{WRITE_ROOT}', '{hive_dir}')."
)
return resolved
# Back-compat alias: existing call sites in this module call
# ``_resolve_path`` directly (e.g. for snapshot dirs, agent tool
# introspection). Those are all non-user-driven paths; route them
# through the read resolver.
_resolve_path = _resolve_read_path
# ── Git snapshot system (ported from opencode's shadow git) ───────────────
@@ -1637,32 +1738,45 @@ def validate_agent_package(agent_name: str) -> str:
def main() -> None:
global PROJECT_ROOT, SNAPSHOT_DIR
global PROJECT_ROOT, SNAPSHOT_DIR, WRITE_ROOT
from aden_tools.file_ops import register_file_tools
parser = argparse.ArgumentParser(description="Coder Tools MCP Server")
parser.add_argument("--project-root", default="")
# ``--write-root`` isolates file writes from the project root so
# queen-authored skills, ledgers, and scripts don't land in the
# hive git checkout. Reads remain permissive under PROJECT_ROOT
# so framework skills, docs, and examples stay accessible.
# Defaults to PROJECT_ROOT when empty (legacy behavior).
parser.add_argument("--write-root", default="")
parser.add_argument("--port", type=int, default=int(os.getenv("CODER_TOOLS_PORT", "4002")))
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--stdio", action="store_true")
args = parser.parse_args()
PROJECT_ROOT = os.path.abspath(args.project_root) if args.project_root else _find_project_root()
if args.write_root:
WRITE_ROOT = os.path.abspath(os.path.expanduser(args.write_root))
os.makedirs(WRITE_ROOT, exist_ok=True)
else:
WRITE_ROOT = PROJECT_ROOT # legacy: no split
SNAPSHOT_DIR = os.path.join(
os.path.expanduser("~"),
".hive",
"snapshots",
os.path.basename(PROJECT_ROOT),
)
logger.info(f"Project root: {PROJECT_ROOT}")
logger.info(f"Project root (reads): {PROJECT_ROOT}")
logger.info(f"Write root (writes): {WRITE_ROOT}")
logger.info(f"Snapshot dir: {SNAPSHOT_DIR}")
register_file_tools(
mcp,
resolve_path=_resolve_path,
resolve_path=_resolve_read_path,
resolve_path_write=_resolve_write_path,
before_write=None, # Git snapshot causes stdio deadlock on Windows; undo_changes limited
project_root=PROJECT_ROOT,
project_root=WRITE_ROOT,
)
if args.stdio:
+12 -5
View File
@@ -328,6 +328,7 @@ def register_file_tools(
mcp: FastMCP,
*,
resolve_path: Callable[[str], str] | None = None,
resolve_path_write: Callable[[str], str] | None = None,
before_write: Callable[[], None] | None = None,
project_root: str | None = None,
) -> None:
@@ -335,12 +336,18 @@ def register_file_tools(
Args:
mcp: FastMCP instance to register tools on.
resolve_path: Path resolver. Default: resolve to absolute path.
Raise ValueError to reject paths (e.g. outside sandbox).
resolve_path: Path resolver for READ operations. Default:
resolve to absolute path. Raise ValueError to reject paths
(e.g. outside sandbox).
resolve_path_write: Path resolver for WRITE/EDIT operations.
Defaults to ``resolve_path`` when not provided. Split
resolvers let callers keep reads permissive (framework
skills, docs) while confining writes to an agent workspace.
before_write: Hook called before write/edit operations (e.g. git snapshot).
project_root: If set, search_files relativizes output paths to this root.
"""
_resolve = resolve_path or _default_resolve_path
_resolve_write = resolve_path_write or _resolve
@mcp.tool()
def read_file(path: str, offset: int = 1, limit: int = 0, hashline: bool = False) -> str:
@@ -440,7 +447,7 @@ def register_file_tools(
path: Absolute file path to write.
content: Complete file content to write.
"""
resolved = _resolve(path)
resolved = _resolve_write(path)
resolved_path = Path(resolved)
# Stale-edit guard: an existing file must have been read recently
@@ -509,7 +516,7 @@ def register_file_tools(
new_text: Replacement text.
replace_all: Replace all occurrences (default: first only).
"""
resolved = _resolve(path)
resolved = _resolve_write(path)
if not os.path.isfile(resolved):
return f"Error: File not found: {path}"
@@ -815,7 +822,7 @@ def register_file_tools(
return "Error: Too many edits in one call (max 100). Split into multiple calls."
# 2. Read file
resolved = _resolve(path)
resolved = _resolve_write(path)
if not os.path.isfile(resolved):
return f"Error: File not found: {path}"
+47 -3
View File
@@ -96,15 +96,59 @@ def register_advanced_tools(mcp: FastMCP) -> None:
profile: str | None = None,
) -> dict:
"""
Execute JavaScript in the browser context.
ESCAPE HATCH — execute raw JavaScript. USE ONLY as a last
resort. 99% of browser automation does NOT need this tool.
Before reaching for it, try a semantic tool first:
- browser_click / browser_click_coordinate for clicks
- browser_type(use_insert_text=True) for text input
- browser_screenshot + browser_get_rect for locating elements
- browser_shadow_query for shadow-DOM selectors
- browser_get_text / browser_get_attribute for reading state
ANTI-PATTERNS — stop and switch tools if you notice yourself:
1. Calling browser_evaluate 2+ times in a row to guess at
selectors. Each attempt costs ~30 tokens of JS + a full
LLM round-trip. After 2 empty results, the selector
strategy is wrong — pivot to browser_screenshot +
browser_click_coordinate. The screenshot + coord path
works on shadow DOM, iframes, and React-obfuscated
class names alike.
2. Writing a walk(root) recursive shadow-DOM traversal
function. Use browser_shadow_query — it does the
traversal in C++ via CDP's querySelector, not in JS.
3. Calling document.execCommand('insertText', ...) to type
into Lexical / contenteditable. Use
browser_type(use_insert_text=True, text='...') instead.
It handles the click-then-focus-then-insert sequence
with built-in retries.
4. Trying to read a nested iframe's contentDocument. That
usually fails (cross-origin or late hydration). Use
browser_screenshot to see it, then browser_click_coordinate.
LEGITIMATE uses (when nothing semantic fits):
- Reading a computed style, window size, or scroll position
that no tool exposes.
- Firing a one-shot site-specific API call (e.g. an analytics
beacon the test needs).
- Stripping an onbeforeunload handler that blocks navigation.
- Probing for shadow roots whose existence is conditional.
Args:
script: JavaScript code to execute
script: JavaScript code to execute. Keep it small. If you
need to traverse the DOM, prefer browser_shadow_query.
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
Returns:
Dict with evaluation result
Dict with evaluation result. On a "find X" script that
returns [] or null: do NOT retry with a different
selector — take a screenshot and switch to coordinates.
"""
bridge = get_bridge()
if not bridge or not bridge.is_connected:
+15
View File
@@ -0,0 +1,15 @@
import json

# Retroactively label pre-A/B-test ledger entries: any messaged profile
# recorded before the 'variant' field existed is tagged 'Control' so later
# analysis can compare variants without discarding the earliest runs.
try:
    with open('data/linkedin_ledger.json', 'r') as f:
        data = json.load(f)
    profiles = data.get('messaged_profiles', [])
    for p in profiles:
        # Only fill in a missing label; never overwrite an explicit variant.
        p.setdefault('variant', 'Control')
    # Write the whole document back, preserving any other top-level keys in
    # the ledger (rebuilding the dict from scratch would silently drop them).
    data['messaged_profiles'] = profiles
    with open('data/linkedin_ledger.json', 'w') as f:
        json.dump(data, f, indent=2)
except Exception as e:
    # Best-effort maintenance script: report the failure and exit cleanly
    # instead of dumping a traceback.
    print(f"Error: {e}")
+16
View File
@@ -0,0 +1,16 @@
{
"replies": [
{
"original_preview": "NASA Ames@NASAAmes\u00b75hWe\u2019re just getting started\n\nDuring their historic journey around the Moon, Artemis II observed lunar targets to study color, text"
},
{
"original_preview": "NASA Marshall@NASA_Marshall\u00b74h Enjoy these views of the Artemis II launch from cameras affixed to the rocket! On April 1, 2026, the SLS (Space Launch "
},
{
"original_preview": "U.S. Navy@USNavy\u00b711hFirst contact. On April 10, U.S. Navy divers were the first on the scene as the Navy and NASA successfully recovered the Orion s"
},
{
"original_preview": "Alright, I give in. Here\u2019s my picture with the boss, courtesy of @johnkrausphotos. Oh, and hook \u2018em!"
}
]
}