Compare commits
144 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fe74718fd9 | |||
| 07c97e2e9b | |||
| 07600c5ab5 | |||
| e7d4ce0057 | |||
| d9813288d9 | |||
| 41fbdcb940 | |||
| 4a9b22719b | |||
| 8cb0531959 | |||
| feabf32768 | |||
| eee55ea8c7 | |||
| 78fffa63ec | |||
| 9a75d45351 | |||
| 3a94f52009 | |||
| 522e0f511e | |||
| e6310f1243 | |||
| 12ffacccab | |||
| 8c36b1575c | |||
| 6540f7b31e | |||
| a09eac06f1 | |||
| b939a875a7 | |||
| b826e70d8c | |||
| 6f2f037c9c | |||
| c147364d8c | |||
| 35bd497750 | |||
| 574c4bbe33 | |||
| d22a01682a | |||
| 0c6f0f8aef | |||
| 0e8efa7bcc | |||
| 7b1dda7bf3 | |||
| 725dd1f410 | |||
| de4b2dc151 | |||
| 0784cea314 | |||
| 20bbf08278 | |||
| f8233bda56 | |||
| 76a7dd4bd5 | |||
| 73511a3c59 | |||
| a0817fcde4 | |||
| 628ce9ca12 | |||
| cc4213a942 | |||
| d12d5b7e8b | |||
| 038c5fd807 | |||
| 3d5f2595c9 | |||
| 7881177f1f | |||
| 2cfea915f4 | |||
| ac46a1be72 | |||
| 7b0b472167 | |||
| 697aae33fe | |||
| d26e7f33d2 | |||
| 6357597e88 | |||
| 579f1d7512 | |||
| 965264c973 | |||
| e80d275321 | |||
| 5b45fac435 | |||
| 4794c8b816 | |||
| 5492366c31 | |||
| ae2aa30edf | |||
| dd69a53de1 | |||
| 062a4e3166 | |||
| fe9a903928 | |||
| 7c3bada70c | |||
| 4ef951447d | |||
| ccb6556a41 | |||
| 5ca5021fc1 | |||
| 9eeba74851 | |||
| facd919371 | |||
| cb1484be85 | |||
| 82ce6bed68 | |||
| efdb404655 | |||
| da361f735d | |||
| eea0429f93 | |||
| 833aa4bc7a | |||
| 0af597881f | |||
| 6fae1f04c8 | |||
| 8c4085f5e8 | |||
| 53240eb888 | |||
| de8d6f0946 | |||
| ea707438f2 | |||
| 445c9600ab | |||
| 2ab5e6d784 | |||
| e7f9b7d791 | |||
| 3cb0c69a96 | |||
| 22d75bfb05 | |||
| 357df1bbcb | |||
| 386bbd5780 | |||
| 235022b35d | |||
| 4d8f312c3e | |||
| 4651a6a85a | |||
| ea9c163438 | |||
| 77cc169606 | |||
| 8c6428f445 | |||
| 44cb0c0f4c | |||
| 2621fb88b1 | |||
| a70f92edbe | |||
| b2efa179ea | |||
| 8c6e76d052 | |||
| c7f1fbf19f | |||
| 7047ecbf46 | |||
| b96ee5aaab | |||
| 6744bea01a | |||
| 390038225b | |||
| b55c8fdf86 | |||
| e9aea0bbc4 | |||
| 0ba1fa8262 | |||
| 0fd96d410e | |||
| c658a7c50b | |||
| 56c3659bda | |||
| 14f927996c | |||
| 8a0ec070b8 | |||
| 80cd77ac30 | |||
| c67521a09c | |||
| 8da06f4f90 | |||
| 46e0413eb8 | |||
| 81731587ff | |||
| 4e9d9bf1ea | |||
| 2644ab953d | |||
| e7daa59573 | |||
| 1bec43afad | |||
| 3d1357595d | |||
| 59ccbba810 | |||
| 8b2ae369ac | |||
| 96a667cbd9 | |||
| 17150a53bd | |||
| c1d7b0ee69 | |||
| 16ea9b52d3 | |||
| dcbfd4ab01 | |||
| b762020793 | |||
| 4ffddc53e6 | |||
| 24bcc5aea7 | |||
| 3c91119f67 | |||
| 923e773c14 | |||
| 199c3a235e | |||
| a881fe68da | |||
| 6b9040477f | |||
| c7cc031060 | |||
| 93c0ef672a | |||
| 67d55e6cce | |||
| 0907ff9cec | |||
| ed2e7125ac | |||
| f39c1c87af | |||
| 1229b4ad4d | |||
| 0d11a946a5 | |||
| b007ed753b | |||
| bb39424e99 | |||
| b27c7a029e |
@@ -47,7 +47,6 @@
|
||||
"Bash(grep -v ':0$')",
|
||||
"Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')",
|
||||
"mcp__gcu-tools__browser_status",
|
||||
"mcp__gcu-tools__browser_start",
|
||||
"mcp__gcu-tools__browser_navigate",
|
||||
"mcp__gcu-tools__browser_evaluate",
|
||||
"mcp__gcu-tools__browser_screenshot",
|
||||
|
||||
@@ -214,7 +214,7 @@ Curated list of known browser automation edge cases with symptoms, causes, and f
|
||||
| **Symptom** | `browser_open()` returns `"No group with id: XXXXXXX"` even though `browser_status` shows `running: true` |
|
||||
| **Root Cause** | In-memory `_contexts` dict has a stale `groupId` from a Chrome tab group that was closed outside the tool (e.g. user closed the tab group) |
|
||||
| **Detection** | `browser_status` returns `running: true` but `browser_open` fails with "No group with id" |
|
||||
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_start()` again |
|
||||
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_open(url)` to lazy-create a fresh one |
|
||||
| **Code** | `tools/lifecycle.py:144-160` - `already_running` check uses cached dict without validating against Chrome |
|
||||
| **Verified** | 2026-04-03 ✓ |
|
||||
|
||||
|
||||
@@ -84,11 +84,23 @@ jobs:
|
||||
with:
|
||||
enable-cache: true
|
||||
|
||||
- name: Install dependencies and run tests
|
||||
- name: Install dependencies
|
||||
working-directory: tools
|
||||
run: |
|
||||
uv sync --extra dev
|
||||
uv run pytest tests/ -v
|
||||
run: uv sync --extra dev
|
||||
|
||||
- name: Install Playwright Chromium (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
working-directory: tools
|
||||
run: uv run playwright install --with-deps chromium
|
||||
|
||||
- name: Install Playwright Chromium (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
working-directory: tools
|
||||
run: uv run playwright install chromium
|
||||
|
||||
- name: Run tests
|
||||
working-directory: tools
|
||||
run: uv run pytest tests/ -v
|
||||
|
||||
validate:
|
||||
name: Validate Agent Exports
|
||||
|
||||
+2
-2
@@ -407,7 +407,7 @@ Aden Hive supports **100+ LLM providers** via LiteLLM, giving users maximum flex
|
||||
| **Anthropic** | Claude 3.5 Sonnet, Haiku, Opus | Default provider, best for reasoning |
|
||||
| **OpenAI** | GPT-4, GPT-4 Turbo, GPT-4o | Function calling, vision |
|
||||
| **OpenRouter** | Any OpenRouter catalog model | Uses `OPENROUTER_API_KEY` and `https://openrouter.ai/api/v1` |
|
||||
| **Hive LLM** | `queen`, `kimi-2.5`, `GLM-5` | Uses `HIVE_API_KEY` and the Hive-managed endpoint |
|
||||
| **Hive LLM** | `queen`, `kimi-k2.5`, `GLM-5` | Uses `HIVE_API_KEY` and the Hive-managed endpoint |
|
||||
| **Google** | Gemini 1.5 Pro, Flash | Long context windows |
|
||||
| **DeepSeek** | DeepSeek V3 | Cost-effective, strong reasoning |
|
||||
| **Mistral** | Mistral Large, Medium, Small | Open weights, EU hosting |
|
||||
@@ -435,7 +435,7 @@ DEFAULT_MODEL = "claude-haiku-4-5-20251001"
|
||||
|
||||
**Provider-Specific Notes**
|
||||
- **OpenRouter**: store `provider` as `openrouter`, use the raw OpenRouter model ID in `model` (for example `x-ai/grok-4.20-beta`), and use `OPENROUTER_API_KEY`
|
||||
- **Hive LLM**: store `provider` as `hive`, use Hive model names such as `queen`, `kimi-2.5`, or `GLM-5`, and use `HIVE_API_KEY`
|
||||
- **Hive LLM**: store `provider` as `hive`, use Hive model names such as `queen`, `kimi-k2.5`, or `GLM-5`, and use `HIVE_API_KEY`
|
||||
|
||||
**For Development**
|
||||
- Use cheaper/faster models (Haiku, GPT-4o-mini)
|
||||
|
||||
@@ -72,17 +72,16 @@ Register an MCP server as a tool source for your agent.
|
||||
"cwd": "../tools",
|
||||
"description": "Aden tools..."
|
||||
},
|
||||
"tools_discovered": 6,
|
||||
"tools_discovered": 5,
|
||||
"tools": [
|
||||
"web_search",
|
||||
"web_scrape",
|
||||
"file_read",
|
||||
"file_write",
|
||||
"pdf_read",
|
||||
"example_tool"
|
||||
"pdf_read"
|
||||
],
|
||||
"total_mcp_servers": 1,
|
||||
"note": "MCP server 'tools' registered with 6 tools. These tools can now be used in event_loop nodes."
|
||||
"note": "MCP server 'tools' registered with 5 tools. These tools can now be used in event_loop nodes."
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# MCP Server Guide - Agent Building Tools
|
||||
|
||||
> **Note:** The standalone `agent-builder` MCP server (`framework.mcp.agent_builder_server`) has been replaced. Agent building is now done via the `coder-tools` server's `initialize_and_build_agent` tool, with underlying logic in `tools/coder_tools_server.py`.
|
||||
> **Note:** This document is stale. The previous `coder-tools` MCP server has been replaced by `files-tools` (`tools/files_server.py`), which only exposes file I/O (`read_file`, `write_file`, `edit_file`, `hashline_edit`, `search_files`). The agent-building, shell, and snapshot tools that used to live here have been removed.
|
||||
|
||||
This guide covers the MCP tools available for building goal-driven agents.
|
||||
|
||||
@@ -20,9 +20,9 @@ Add to your MCP client configuration (e.g., Claude Desktop):
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"coder-tools": {
|
||||
"files-tools": {
|
||||
"command": "uv",
|
||||
"args": ["run", "coder_tools_server.py", "--stdio"],
|
||||
"args": ["run", "files_server.py", "--stdio"],
|
||||
"cwd": "/path/to/hive/tools"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,6 @@ uv pip install -e .
|
||||
|
||||
## Agent Building
|
||||
|
||||
Agent scaffolding is handled by the `coder-tools` MCP server (in `tools/coder_tools_server.py`), which provides the `initialize_and_build_agent` tool and related utilities. The package generation logic lives directly in `tools/coder_tools_server.py`.
|
||||
|
||||
See the [Getting Started Guide](../docs/getting-started.md) for building agents.
|
||||
|
||||
## Quick Start
|
||||
|
||||
@@ -14,7 +14,6 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
@@ -66,12 +65,12 @@ from framework.agent_loop.internals.stall_detector import (
|
||||
ngram_similarity,
|
||||
)
|
||||
from framework.agent_loop.internals.synthetic_tools import (
|
||||
build_ask_user_multiple_tool,
|
||||
build_ask_user_tool,
|
||||
build_escalate_tool,
|
||||
build_report_to_parent_tool,
|
||||
handle_report_to_parent,
|
||||
)
|
||||
from framework.agent_loop.internals.tool_input_coercer import coerce_tool_input
|
||||
from framework.agent_loop.internals.tool_result_handler import (
|
||||
build_json_preview,
|
||||
execute_tool,
|
||||
@@ -85,7 +84,12 @@ from framework.agent_loop.internals.types import (
|
||||
JudgeVerdict,
|
||||
TriggerEvent,
|
||||
)
|
||||
from framework.agent_loop.internals.vision_fallback import (
|
||||
caption_tool_image,
|
||||
extract_intent_for_tool,
|
||||
)
|
||||
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
|
||||
from framework.config import get_vision_fallback_model
|
||||
from framework.host.event_bus import EventBus
|
||||
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
@@ -177,46 +181,58 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
|
||||
return cleaned
|
||||
|
||||
|
||||
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
|
||||
"""Describe images using the best available vision model."""
|
||||
import litellm
|
||||
def _vision_fallback_active(model: str | None) -> bool:
|
||||
"""Return True if tool-result images for *model* should be routed
|
||||
through the vision-fallback chain rather than sent to the model.
|
||||
|
||||
blocks: list[dict[str, Any]] = [
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
"Describe the following image(s) concisely but with enough detail "
|
||||
"that a text-only AI assistant can understand the content and context."
|
||||
),
|
||||
}
|
||||
]
|
||||
blocks.extend(image_content)
|
||||
Trigger: the model's catalog entry has ``supports_vision: false``
|
||||
(resolved via :func:`capabilities.supports_image_tool_results`,
|
||||
which reads ``model_catalog.json``). Unknown models default to
|
||||
vision-capable, so the fallback only fires when the catalog
|
||||
explicitly says the model is text-only.
|
||||
|
||||
candidates: list[str] = []
|
||||
if os.environ.get("OPENAI_API_KEY"):
|
||||
candidates.append("gpt-4o-mini")
|
||||
if os.environ.get("ANTHROPIC_API_KEY"):
|
||||
candidates.append("claude-3-haiku-20240307")
|
||||
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
|
||||
candidates.append("gemini/gemini-1.5-flash")
|
||||
The ``vision_fallback`` config block is the *substitution* model —
|
||||
it doesn't widen the trigger. To force fallback for a model that
|
||||
isn't catalogued yet, add an entry to ``model_catalog.json`` with
|
||||
``supports_vision: false`` rather than relying on a runtime config.
|
||||
"""
|
||||
if not model:
|
||||
return False
|
||||
return not supports_image_tool_results(model)
|
||||
|
||||
for model in candidates:
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": blocks}],
|
||||
max_tokens=512,
|
||||
)
|
||||
description = (response.choices[0].message.content or "").strip()
|
||||
if description:
|
||||
count = len(image_content)
|
||||
label = "image" if count == 1 else f"{count} images"
|
||||
return f"[{label} attached — description: {description}]"
|
||||
except Exception as exc:
|
||||
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
|
||||
continue
|
||||
|
||||
return None
|
||||
async def _captioning_chain(
|
||||
intent: str,
|
||||
image_content: list[dict[str, Any]],
|
||||
) -> tuple[str, str] | None:
|
||||
"""Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
|
||||
|
||||
The Gemini override reuses the configured ``api_key`` / ``api_base``,
|
||||
so a Hive subscriber (whose token routes to a multi-model proxy)
|
||||
keeps coverage when their primary model glitches. Without
|
||||
configured creds litellm falls through to env-based Gemini auth;
|
||||
users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
|
||||
third try.
|
||||
"""
|
||||
if result := await caption_tool_image(intent, image_content):
|
||||
return result
|
||||
logger.warning("vision_fallback failed; retrying configured model")
|
||||
if result := await caption_tool_image(intent, image_content):
|
||||
return result
|
||||
# Match the configured model's proxy prefix so the override is routed
|
||||
# through the same endpoint with the same auth shape. Without this,
|
||||
# a Hive subscriber's `hive/...` config would override to
|
||||
# `gemini/...` — which sends Google's Gemini protocol to the
|
||||
# Anthropic-compatible Hive proxy (404), not what we want.
|
||||
configured = (get_vision_fallback_model() or "").lower()
|
||||
if configured.startswith("hive/"):
|
||||
override = "hive/gemini-3-flash-preview"
|
||||
elif configured.startswith("kimi/"):
|
||||
override = "kimi/gemini-3-flash-preview"
|
||||
else:
|
||||
override = "gemini/gemini-3-flash-preview"
|
||||
logger.warning("vision_fallback retry failed; trying %s", override)
|
||||
return await caption_tool_image(intent, image_content, model_override=override)
|
||||
|
||||
|
||||
# Pattern for detecting context-window-exceeded errors across LLM providers.
|
||||
@@ -376,6 +392,14 @@ class AgentLoop(AgentProtocol):
|
||||
# dashboards can build aggregates over many runs.
|
||||
self._counters: dict[str, int] = {}
|
||||
|
||||
# Task-system reminder state (see framework/tasks/reminders.py).
|
||||
# Bumped each iteration; reset whenever a task op tool was called
|
||||
# in the iteration that just completed; nudges the agent via the
|
||||
# injection queue when it's been silent on tasks for too long.
|
||||
from framework.tasks.reminders import ReminderState as _RS
|
||||
|
||||
self._task_reminder_state: _RS = _RS()
|
||||
|
||||
def _bump(self, key: str, by: int = 1) -> None:
|
||||
"""Increment a reliability counter (creates the key on first use)."""
|
||||
self._counters[key] = self._counters.get(key, 0) + by
|
||||
@@ -575,6 +599,7 @@ class AgentLoop(AgentProtocol):
|
||||
store=self._conversation_store,
|
||||
run_id=ctx.effective_run_id,
|
||||
compaction_buffer_tokens=self._config.compaction_buffer_tokens,
|
||||
compaction_buffer_ratio=self._config.compaction_buffer_ratio,
|
||||
compaction_warning_buffer_tokens=(self._config.compaction_warning_buffer_tokens),
|
||||
)
|
||||
accumulator = OutputAccumulator(
|
||||
@@ -587,7 +612,12 @@ class AgentLoop(AgentProtocol):
|
||||
|
||||
initial_message = self._build_initial_message(ctx)
|
||||
if initial_message:
|
||||
await conversation.add_user_message(initial_message)
|
||||
# Stamp with arrival time so the conversation has a
|
||||
# temporal anchor for the first turn, matching the
|
||||
# stamping done by drain_injection_queue for every
|
||||
# subsequent event.
|
||||
_stamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M %Z")
|
||||
await conversation.add_user_message(f"[{_stamp}] {initial_message}")
|
||||
|
||||
await self._run_hooks("session_start", conversation, trigger=initial_message)
|
||||
|
||||
@@ -599,7 +629,8 @@ class AgentLoop(AgentProtocol):
|
||||
initial_message = self._build_initial_message(ctx)
|
||||
if not initial_message:
|
||||
initial_message = "Hello"
|
||||
await conversation.add_user_message(initial_message)
|
||||
_stamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M %Z")
|
||||
await conversation.add_user_message(f"[{_stamp}] {initial_message}")
|
||||
|
||||
# 2b. Restore spill counter from existing files (resume safety)
|
||||
self._restore_spill_counter()
|
||||
@@ -608,8 +639,6 @@ class AgentLoop(AgentProtocol):
|
||||
tools = list(ctx.available_tools)
|
||||
if ctx.supports_direct_user_io:
|
||||
tools.append(self._build_ask_user_tool())
|
||||
if stream_id == "queen" or stream_id == "overseer":
|
||||
tools.append(self._build_ask_user_multiple_tool())
|
||||
# Workers (parallel ephemeral agents) get escalate + report_to_parent.
|
||||
# The overseer is client-facing like the queen and has neither.
|
||||
if stream_id not in ("queen", "judge", "overseer"):
|
||||
@@ -621,8 +650,23 @@ class AgentLoop(AgentProtocol):
|
||||
# Hide image-producing tools from text-only models so they never try
|
||||
# to call them. Avoids wasted turns + "screenshot failed" lessons
|
||||
# getting saved to memory. See framework.llm.capabilities.
|
||||
# EXCEPTION: when the model IS on the text-only deny list AND
|
||||
# a vision_fallback subagent is configured, leave image tools
|
||||
# visible. The post-execution hook in the inner tool loop
|
||||
# will route each image_content through the fallback VLM and
|
||||
# replace it with a text caption before the main agent sees
|
||||
# the result — so the main agent gets captions instead of
|
||||
# raw images, rather than losing the tool entirely. We DON'T
|
||||
# bypass the filter for vision-capable models (that would be
|
||||
# a no-op anyway — the filter doesn't fire for them) and we
|
||||
# DON'T bypass it without a configured fallback (the agent
|
||||
# would just see raw stripped tool results with no caption).
|
||||
_llm_model = ctx.llm.model if ctx.llm else ""
|
||||
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
|
||||
_text_only_main = _llm_model and not supports_image_tool_results(_llm_model)
|
||||
if _text_only_main and get_vision_fallback_model() is not None:
|
||||
_hidden_image_tools: list[str] = []
|
||||
else:
|
||||
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
|
||||
|
||||
logger.info(
|
||||
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
|
||||
@@ -754,8 +798,6 @@ class AgentLoop(AgentProtocol):
|
||||
)
|
||||
got_input = await self._await_user_input(
|
||||
ctx,
|
||||
prompt=str(pending_input_state.get("prompt", "")),
|
||||
options=pending_input_state.get("options"),
|
||||
questions=pending_input_state.get("questions"),
|
||||
emit_client_request=bool(pending_input_state.get("emit_client_request", True)),
|
||||
)
|
||||
@@ -789,7 +831,6 @@ class AgentLoop(AgentProtocol):
|
||||
if ctx.dynamic_tools_provider is not None:
|
||||
_synthetic_names = {
|
||||
"ask_user",
|
||||
"ask_user_multiple",
|
||||
"escalate",
|
||||
}
|
||||
synthetic = [t for t in tools if t.name in _synthetic_names]
|
||||
@@ -798,14 +839,56 @@ class AgentLoop(AgentProtocol):
|
||||
tools.extend(synthetic)
|
||||
|
||||
# 6b3. Dynamic prompt refresh (phase switching / memory refresh)
|
||||
if ctx.dynamic_prompt_provider is not None or ctx.dynamic_memory_provider is not None:
|
||||
if (
|
||||
ctx.dynamic_prompt_provider is not None
|
||||
or ctx.dynamic_memory_provider is not None
|
||||
or ctx.dynamic_skills_catalog_provider is not None
|
||||
):
|
||||
if ctx.dynamic_prompt_provider is not None:
|
||||
_new_prompt = stamp_prompt_datetime(ctx.dynamic_prompt_provider())
|
||||
_new_prompt = ctx.dynamic_prompt_provider()
|
||||
# When a suffix provider is also wired (Queen's
|
||||
# static/dynamic split), keep the two pieces separate
|
||||
# so the LLM wrapper can emit them as two system
|
||||
# content blocks with a cache breakpoint between them.
|
||||
# The timestamp used to be stamped here via
|
||||
# stamp_prompt_datetime on every iteration — it now
|
||||
# lives inside the frozen dynamic suffix and is only
|
||||
# refreshed at user-turn boundaries, so per-iteration
|
||||
# stamping would both double-stamp and bust the cache.
|
||||
_new_suffix: str | None = None
|
||||
if ctx.dynamic_prompt_suffix_provider is not None:
|
||||
try:
|
||||
_new_suffix = ctx.dynamic_prompt_suffix_provider() or ""
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"[%s] dynamic_prompt_suffix_provider raised — falling back to legacy stamp",
|
||||
node_id,
|
||||
exc_info=True,
|
||||
)
|
||||
_new_suffix = None
|
||||
if _new_suffix is None:
|
||||
# Legacy / fallback path: no split in use (or the
|
||||
# suffix provider raised). Stamp the timestamp at
|
||||
# the end of the single-string prompt so the model
|
||||
# still sees a current "now".
|
||||
_new_prompt = stamp_prompt_datetime(_new_prompt)
|
||||
else:
|
||||
# build_system_prompt_for_context reads dynamic_skills_catalog_provider
|
||||
# directly; no separate branch needed.
|
||||
_new_prompt = build_system_prompt_for_context(ctx)
|
||||
if _new_prompt != conversation.system_prompt:
|
||||
conversation.update_system_prompt(_new_prompt)
|
||||
logger.info("[%s] Dynamic prompt updated", node_id)
|
||||
_new_suffix = None
|
||||
if _new_suffix is not None:
|
||||
_combined_for_compare = f"{_new_prompt}\n\n{_new_suffix}" if _new_suffix else _new_prompt
|
||||
if (
|
||||
_combined_for_compare != conversation.system_prompt
|
||||
or _new_suffix != conversation.system_prompt_dynamic_suffix
|
||||
):
|
||||
conversation.update_system_prompt(_new_prompt, dynamic_suffix=_new_suffix)
|
||||
logger.info("[%s] Dynamic prompt updated (split)", node_id)
|
||||
else:
|
||||
if _new_prompt != conversation.system_prompt:
|
||||
conversation.update_system_prompt(_new_prompt)
|
||||
logger.info("[%s] Dynamic prompt updated", node_id)
|
||||
|
||||
# 6c. Publish iteration event (with per-iteration metadata when available)
|
||||
_iter_meta = None
|
||||
@@ -863,8 +946,6 @@ class AgentLoop(AgentProtocol):
|
||||
turn_tokens,
|
||||
logged_tool_calls,
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
request_system_prompt,
|
||||
request_messages,
|
||||
@@ -889,6 +970,17 @@ class AgentLoop(AgentProtocol):
|
||||
)
|
||||
total_input_tokens += turn_tokens.get("input", 0)
|
||||
total_output_tokens += turn_tokens.get("output", 0)
|
||||
|
||||
# Task-system reminder: if the model has been silent on
|
||||
# task ops for too long but still has open tasks, drop
|
||||
# a steering reminder onto the injection queue. Drained
|
||||
# at the next iteration's 6b so it lands as the next
|
||||
# user turn via the normal injection path. Best-effort
|
||||
# — never raises.
|
||||
try:
|
||||
await self._maybe_inject_task_reminder(ctx, logged_tool_calls)
|
||||
except Exception:
|
||||
logger.debug("task reminder check failed", exc_info=True)
|
||||
await self._publish_llm_turn_complete(
|
||||
stream_id,
|
||||
node_id,
|
||||
@@ -897,6 +989,8 @@ class AgentLoop(AgentProtocol):
|
||||
input_tokens=turn_tokens.get("input", 0),
|
||||
output_tokens=turn_tokens.get("output", 0),
|
||||
cached_tokens=turn_tokens.get("cached", 0),
|
||||
cache_creation_tokens=turn_tokens.get("cache_creation", 0),
|
||||
cost_usd=float(turn_tokens.get("cost", 0.0) or 0.0),
|
||||
execution_id=execution_id,
|
||||
iteration=iteration,
|
||||
)
|
||||
@@ -911,6 +1005,7 @@ class AgentLoop(AgentProtocol):
|
||||
tool_calls=logged_tool_calls,
|
||||
tool_results=real_tool_results,
|
||||
token_counts=turn_tokens,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
# DS-13: inject context preservation warning once when token usage
|
||||
@@ -1080,7 +1175,7 @@ class AgentLoop(AgentProtocol):
|
||||
inner_turn=0,
|
||||
)
|
||||
await conversation.add_assistant_message(visible_error)
|
||||
await self._await_user_input(ctx, prompt="")
|
||||
await self._await_user_input(ctx)
|
||||
_llm_turn_failed_waiting_input = True
|
||||
break # exit retry loop, continue outer iteration
|
||||
|
||||
@@ -1129,7 +1224,7 @@ class AgentLoop(AgentProtocol):
|
||||
if _turn_cancelled:
|
||||
logger.info("[%s] iter=%d: turn cancelled by user", node_id, iteration)
|
||||
if ctx.supports_direct_user_io:
|
||||
await self._await_user_input(ctx, prompt="")
|
||||
await self._await_user_input(ctx)
|
||||
continue # back to top of for-iteration loop
|
||||
|
||||
# Queen non-transient LLM failures wait for user input and then
|
||||
@@ -1260,7 +1355,7 @@ class AgentLoop(AgentProtocol):
|
||||
iteration,
|
||||
_consecutive_empty_turns,
|
||||
)
|
||||
await self._await_user_input(ctx, prompt="")
|
||||
await self._await_user_input(ctx)
|
||||
_consecutive_empty_turns = 0
|
||||
else:
|
||||
await conversation.add_user_message(
|
||||
@@ -1336,7 +1431,6 @@ class AgentLoop(AgentProtocol):
|
||||
if tc.get("tool_name")
|
||||
not in (
|
||||
"ask_user",
|
||||
"ask_user_multiple",
|
||||
"escalate",
|
||||
)
|
||||
]
|
||||
@@ -1384,7 +1478,7 @@ class AgentLoop(AgentProtocol):
|
||||
recent_responses.clear()
|
||||
elif ctx.supports_direct_user_io:
|
||||
await conversation.add_user_message(warning_msg)
|
||||
await self._await_user_input(ctx, prompt=doom_desc)
|
||||
await self._await_user_input(ctx)
|
||||
recent_tool_fingerprints.clear()
|
||||
recent_responses.clear()
|
||||
else:
|
||||
@@ -1507,11 +1601,9 @@ class AgentLoop(AgentProtocol):
|
||||
# conversation — they flow through without blocking.
|
||||
_cf_block = False
|
||||
_cf_auto = False
|
||||
_cf_prompt = ""
|
||||
if ctx.supports_direct_user_io:
|
||||
if user_input_requested:
|
||||
_cf_block = True
|
||||
_cf_prompt = ask_user_prompt
|
||||
elif stream_id == "queen" and not real_tool_results and not outputs_set:
|
||||
# Auto-block: only for the queen (conversational node).
|
||||
# Workers are autonomous — they block only on explicit
|
||||
@@ -1614,13 +1706,13 @@ class AgentLoop(AgentProtocol):
|
||||
iteration,
|
||||
_cf_auto,
|
||||
)
|
||||
# Check for multi-question batch from ask_user_multiple
|
||||
multi_qs = getattr(self, "_pending_multi_questions", None)
|
||||
self._pending_multi_questions = None
|
||||
# Pull the pending questions array set by the ask_user
|
||||
# handler (a 1-item list for a single question, 2-8 for a
|
||||
# batch). None for auto-block turns with no explicit ask.
|
||||
pending_qs = getattr(self, "_pending_questions", None)
|
||||
self._pending_questions = None
|
||||
pending_input_state = {
|
||||
"prompt": _cf_prompt,
|
||||
"options": ask_user_options,
|
||||
"questions": multi_qs,
|
||||
"questions": pending_qs,
|
||||
"emit_client_request": True,
|
||||
}
|
||||
await self._write_cursor(
|
||||
@@ -1634,11 +1726,9 @@ class AgentLoop(AgentProtocol):
|
||||
)
|
||||
got_input = await self._await_user_input(
|
||||
ctx,
|
||||
prompt=_cf_prompt,
|
||||
options=ask_user_options,
|
||||
questions=multi_qs,
|
||||
questions=pending_qs,
|
||||
)
|
||||
# Emit deferred tool_call_completed for ask_user / ask_user_multiple
|
||||
# Emit deferred tool_call_completed for ask_user
|
||||
deferred = getattr(self, "_deferred_tool_complete", None)
|
||||
if deferred:
|
||||
self._deferred_tool_complete = None
|
||||
@@ -1803,7 +1893,7 @@ class AgentLoop(AgentProtocol):
|
||||
recent_tool_fingerprints=recent_tool_fingerprints,
|
||||
pending_input=pending_input_state,
|
||||
)
|
||||
got_input = await self._await_user_input(ctx, prompt="", emit_client_request=False)
|
||||
got_input = await self._await_user_input(ctx, emit_client_request=False)
|
||||
logger.info(
|
||||
"[%s] iter=%d: queen wait unblocked, got_input=%s",
|
||||
node_id,
|
||||
@@ -2195,9 +2285,7 @@ class AgentLoop(AgentProtocol):
|
||||
async def _await_user_input(
|
||||
self,
|
||||
ctx: AgentContext,
|
||||
prompt: str = "",
|
||||
*,
|
||||
options: list[str] | None = None,
|
||||
questions: list[dict] | None = None,
|
||||
emit_client_request: bool = True,
|
||||
) -> bool:
|
||||
@@ -2210,11 +2298,11 @@ class AgentLoop(AgentProtocol):
|
||||
before the judge runs.
|
||||
|
||||
Args:
|
||||
options: Optional predefined choices for the user (from ask_user).
|
||||
Passed through to the CLIENT_INPUT_REQUESTED event so the
|
||||
frontend can render a QuestionWidget with buttons.
|
||||
questions: Optional list of question dicts for ask_user_multiple.
|
||||
Each dict has id, prompt, and optional options.
|
||||
questions: Optional list of question dicts from ask_user. Each
|
||||
dict has id, prompt, and optional options. Passed through to
|
||||
the CLIENT_INPUT_REQUESTED event so the frontend can render
|
||||
the appropriate widget (QuestionWidget for one, else
|
||||
MultiQuestionWidget).
|
||||
emit_client_request: When False, wait silently without publishing
|
||||
CLIENT_INPUT_REQUESTED. Used for worker waits where input is
|
||||
expected from the queen via inject_message().
|
||||
@@ -2243,9 +2331,7 @@ class AgentLoop(AgentProtocol):
|
||||
await self._event_bus.emit_client_input_requested(
|
||||
stream_id=ctx.stream_id or ctx.agent_id,
|
||||
node_id=ctx.agent_id,
|
||||
prompt=prompt,
|
||||
execution_id=ctx.execution_id or "",
|
||||
options=options,
|
||||
questions=questions,
|
||||
)
|
||||
|
||||
@@ -2277,8 +2363,6 @@ class AgentLoop(AgentProtocol):
|
||||
dict[str, int],
|
||||
list[dict],
|
||||
bool,
|
||||
str,
|
||||
list[str] | None,
|
||||
bool,
|
||||
str,
|
||||
list[dict[str, Any]],
|
||||
@@ -2287,8 +2371,7 @@ class AgentLoop(AgentProtocol):
|
||||
"""Run a single LLM turn with streaming and tool execution.
|
||||
|
||||
Returns (assistant_text, real_tool_results, outputs_set, token_counts, logged_tool_calls,
|
||||
user_input_requested, ask_user_prompt, ask_user_options, queen_input_requested,
|
||||
system_prompt, messages, reported_to_parent).
|
||||
user_input_requested, queen_input_requested, system_prompt, messages, reported_to_parent).
|
||||
|
||||
``real_tool_results`` contains only results from actual tools (web_search,
|
||||
etc.), NOT from synthetic framework tools such as ``set_output``,
|
||||
@@ -2309,7 +2392,9 @@ class AgentLoop(AgentProtocol):
|
||||
stream_id = ctx.stream_id or ctx.agent_id
|
||||
node_id = ctx.agent_id
|
||||
execution_id = ctx.execution_id or ""
|
||||
token_counts: dict[str, int] = {"input": 0, "output": 0, "cached": 0}
|
||||
# Mixed-type dict: int token counts + str stop_reason/model + float cost.
|
||||
# Typed loosely to avoid churn in the many call sites that read from it.
|
||||
token_counts: dict[str, Any] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0, "cost": 0.0}
|
||||
tool_call_count = 0
|
||||
final_text = ""
|
||||
final_system_prompt = conversation.system_prompt
|
||||
@@ -2317,8 +2402,6 @@ class AgentLoop(AgentProtocol):
|
||||
# Track output keys set via set_output across all inner iterations
|
||||
outputs_set_this_turn: list[str] = []
|
||||
user_input_requested = False
|
||||
ask_user_prompt = ""
|
||||
ask_user_options: list[str] | None = None
|
||||
queen_input_requested = False
|
||||
# Accumulate ALL tool calls across inner iterations for L3 logging.
|
||||
# Unlike real_tool_results (reset each inner iteration), this persists.
|
||||
@@ -2452,9 +2535,16 @@ class AgentLoop(AgentProtocol):
|
||||
nonlocal _first_event_at
|
||||
_clean_snapshot = "" # visible-only text for the frontend
|
||||
|
||||
# Split-prompt path: pass STATIC and DYNAMIC tail separately
|
||||
# so the LLM wrapper can emit them as two Anthropic system
|
||||
# content blocks with a cache breakpoint between them. When
|
||||
# no split is in use, ``system_prompt_static`` equals the
|
||||
# full prompt and the suffix is empty — identical to the
|
||||
# legacy single-block request.
|
||||
async for event in ctx.llm.stream(
|
||||
messages=_msgs,
|
||||
system=conversation.system_prompt,
|
||||
system=conversation.system_prompt_static,
|
||||
system_dynamic_suffix=(conversation.system_prompt_dynamic_suffix or None),
|
||||
tools=tools if tools else None,
|
||||
max_tokens=ctx.max_tokens,
|
||||
):
|
||||
@@ -2535,6 +2625,8 @@ class AgentLoop(AgentProtocol):
|
||||
token_counts["input"] += event.input_tokens
|
||||
token_counts["output"] += event.output_tokens
|
||||
token_counts["cached"] += event.cached_tokens
|
||||
token_counts["cache_creation"] += event.cache_creation_tokens
|
||||
token_counts["cost"] = token_counts.get("cost", 0.0) + event.cost_usd
|
||||
token_counts["stop_reason"] = event.stop_reason
|
||||
token_counts["model"] = event.model
|
||||
|
||||
@@ -2808,8 +2900,6 @@ class AgentLoop(AgentProtocol):
|
||||
token_counts,
|
||||
logged_tool_calls,
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
final_system_prompt,
|
||||
final_messages,
|
||||
@@ -2859,7 +2949,17 @@ class AgentLoop(AgentProtocol):
|
||||
# nudge on its next turn without losing the real execution output.
|
||||
replay_prefixes_by_id: dict[str, str] = {}
|
||||
|
||||
# Schema-driven coercion of tool arguments. Heals the small
|
||||
# handful of drift patterns that non-frontier models emit
|
||||
# (numbers-as-strings, array-of-{label} wrappers, arrays
|
||||
# sent as JSON strings, singleton scalars). Runs once per
|
||||
# tool call before dispatch; see tool_input_coercer module.
|
||||
_tool_by_name = {t.name: t for t in tools}
|
||||
|
||||
for tc in tool_calls:
|
||||
_tool_schema = _tool_by_name.get(tc.tool_name)
|
||||
if _tool_schema is not None:
|
||||
coerce_tool_input(_tool_schema, tc.tool_input)
|
||||
tool_call_count += 1
|
||||
if hard_limit > 0 and tool_call_count > hard_limit:
|
||||
limit_hit = True
|
||||
@@ -2892,54 +2992,89 @@ class AgentLoop(AgentProtocol):
|
||||
|
||||
elif tc.tool_name == "ask_user":
|
||||
# --- Framework-level ask_user handling ---
|
||||
ask_user_prompt = tc.tool_input.get("question", "")
|
||||
raw_options = tc.tool_input.get("options", None)
|
||||
|
||||
# Self-heal: some model families (notably the queen
|
||||
# profile prompt poisoning the output style) cram
|
||||
# the options inside the question string as a
|
||||
# pseudo-XML blob like:
|
||||
#
|
||||
# "What do you want to do?</question>\n_OPTIONS:
|
||||
# [\"De-risk\", \"Add\", \"Short\"]"
|
||||
#
|
||||
# When that happens the question text leaks
|
||||
# </question> and _OPTIONS: into the chat UI and
|
||||
# the buttons never appear. Detect + repair.
|
||||
# The consolidated tool always takes a `questions`
|
||||
# array (1-8 entries). A single-entry array is the
|
||||
# common case; longer arrays batch several questions
|
||||
# into one turn so the user answers them all at once.
|
||||
from framework.agent_loop.internals.synthetic_tools import (
|
||||
sanitize_ask_user_inputs,
|
||||
)
|
||||
|
||||
ask_user_prompt, recovered_options = sanitize_ask_user_inputs(ask_user_prompt, raw_options)
|
||||
if recovered_options is not None and raw_options is None:
|
||||
raw_options = recovered_options
|
||||
# Defensive: ensure options is a list of strings.
|
||||
# Smaller models sometimes send a string instead of
|
||||
# an array — try to recover gracefully.
|
||||
ask_user_options: list[str] | None = None
|
||||
if isinstance(raw_options, list):
|
||||
ask_user_options = [str(o) for o in raw_options if o]
|
||||
elif isinstance(raw_options, str) and raw_options.strip():
|
||||
# Try JSON parse first (e.g. '["a","b"]')
|
||||
try:
|
||||
parsed = json.loads(raw_options)
|
||||
if isinstance(parsed, list):
|
||||
ask_user_options = [str(o) for o in parsed if o]
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
if ask_user_options is not None and len(ask_user_options) < 2:
|
||||
ask_user_options = None # fall back to free-text input
|
||||
|
||||
# Workers MUST provide at least 2 options — no free-text
|
||||
# questions allowed. Only the queen may omit options.
|
||||
if ask_user_options is None and stream_id != "queen":
|
||||
raw_questions = tc.tool_input.get("questions", None)
|
||||
if not isinstance(raw_questions, list) or not raw_questions:
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"ERROR: options are required. Provide at least "
|
||||
"2 predefined choices in the 'options' array. "
|
||||
'Example: {"question": "...", "options": '
|
||||
'["Yes", "No"]}'
|
||||
"ERROR: ask_user requires a non-empty "
|
||||
"'questions' array. Each entry must have "
|
||||
"{id, prompt, options?}. Example: "
|
||||
'{"questions": [{"id": "q1", "prompt": '
|
||||
'"What now?", "options": ["A", "B"]}]}'
|
||||
),
|
||||
is_error=True,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
user_input_requested = False
|
||||
continue
|
||||
|
||||
# Normalize + self-heal each question entry. The
|
||||
# generic tool_input_coercer has already handled
|
||||
# schema-shape drift (array-of-string options, JSON
|
||||
# strings, etc.), so here we only deal with
|
||||
# prompt-style drift: some model families cram
|
||||
# options inside the prompt as a pseudo-XML blob
|
||||
# like "What now?</question>\n_OPTIONS: [\"A\", \"B\"]".
|
||||
# sanitize_ask_user_inputs strips the tag and
|
||||
# recovers the inline options as a fallback.
|
||||
questions: list[dict] = []
|
||||
for i, q in enumerate(raw_questions):
|
||||
if not isinstance(q, dict):
|
||||
continue
|
||||
qid = str(q.get("id", f"q{i + 1}"))
|
||||
raw_prompt = q.get("prompt", q.get("question", ""))
|
||||
raw_opts = q.get("options", None)
|
||||
cleaned_prompt, recovered_opts = sanitize_ask_user_inputs(raw_prompt, raw_opts)
|
||||
|
||||
opts: list[str] | None = None
|
||||
if isinstance(raw_opts, list) and raw_opts:
|
||||
opts = [str(o) for o in raw_opts if o]
|
||||
elif recovered_opts is not None:
|
||||
opts = recovered_opts
|
||||
if opts is not None and len(opts) < 2:
|
||||
opts = None # fall back to free-text
|
||||
|
||||
questions.append(
|
||||
{
|
||||
"id": qid,
|
||||
"prompt": cleaned_prompt,
|
||||
**({"options": opts} if opts else {}),
|
||||
}
|
||||
)
|
||||
|
||||
if not questions:
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"ERROR: no valid question objects in "
|
||||
"'questions'. Each entry must be an "
|
||||
"object with 'id' and 'prompt'."
|
||||
),
|
||||
is_error=True,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
user_input_requested = False
|
||||
continue
|
||||
|
||||
# Workers MUST provide options on every question —
|
||||
# free-text asks are queen-only.
|
||||
if stream_id != "queen" and any("options" not in q for q in questions):
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"ERROR: options are required on every "
|
||||
"question for worker nodes. Provide at "
|
||||
"least 2 predefined choices in the "
|
||||
"'options' array of each question."
|
||||
),
|
||||
is_error=True,
|
||||
)
|
||||
@@ -2949,77 +3084,31 @@ class AgentLoop(AgentProtocol):
|
||||
|
||||
user_input_requested = True
|
||||
|
||||
# Free-form ask_user (no options): stream the question
|
||||
# text as a chat message so the user can see it. When
|
||||
# options are present the QuestionWidget shows the
|
||||
# question, but without options nothing renders it.
|
||||
if ask_user_options is None and ask_user_prompt and ctx.emits_client_io:
|
||||
# Single free-form question: stream the prompt as a
|
||||
# chat message so the user sees it. Widget-rendered
|
||||
# cases (single-with-options, multi) draw their own
|
||||
# question text, so no text delta is needed.
|
||||
if (
|
||||
len(questions) == 1
|
||||
and "options" not in questions[0]
|
||||
and questions[0]["prompt"]
|
||||
and ctx.emits_client_io
|
||||
):
|
||||
_q_text = questions[0]["prompt"]
|
||||
await self._publish_text_delta(
|
||||
stream_id,
|
||||
node_id,
|
||||
content=ask_user_prompt,
|
||||
snapshot=ask_user_prompt,
|
||||
content=_q_text,
|
||||
snapshot=_q_text,
|
||||
ctx=ctx,
|
||||
execution_id=execution_id,
|
||||
iteration=iteration,
|
||||
inner_turn=inner_turn,
|
||||
)
|
||||
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content="Waiting for user input...",
|
||||
is_error=False,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
|
||||
elif tc.tool_name == "ask_user_multiple":
|
||||
# --- Framework-level ask_user_multiple ---
|
||||
raw_questions = tc.tool_input.get("questions", [])
|
||||
if not isinstance(raw_questions, list) or len(raw_questions) < 2:
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"ERROR: questions must be an array of at "
|
||||
"least 2 question objects. Use ask_user "
|
||||
"for single questions."
|
||||
),
|
||||
is_error=True,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
user_input_requested = False
|
||||
continue
|
||||
|
||||
# Normalize each question entry
|
||||
questions: list[dict] = []
|
||||
for i, q in enumerate(raw_questions):
|
||||
if not isinstance(q, dict):
|
||||
continue
|
||||
qid = str(q.get("id", f"q{i + 1}"))
|
||||
prompt = str(q.get("prompt", ""))
|
||||
opts = q.get("options", None)
|
||||
if isinstance(opts, list):
|
||||
opts = [str(o) for o in opts if o]
|
||||
if len(opts) < 2:
|
||||
opts = None
|
||||
else:
|
||||
opts = None
|
||||
questions.append(
|
||||
{
|
||||
"id": qid,
|
||||
"prompt": prompt,
|
||||
**({"options": opts} if opts else {}),
|
||||
}
|
||||
)
|
||||
|
||||
user_input_requested = True
|
||||
|
||||
# Store as multi-question prompt/options for
|
||||
# the event emission path
|
||||
ask_user_prompt = ""
|
||||
ask_user_options = None
|
||||
# Pass the full questions list via a special
|
||||
# key that the event emitter picks up
|
||||
self._pending_multi_questions = questions
|
||||
# Stash the normalized questions list for the
|
||||
# blocking path (§1612) + event emission.
|
||||
self._pending_questions = questions
|
||||
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
@@ -3330,6 +3419,30 @@ class AgentLoop(AgentProtocol):
|
||||
|
||||
# Phase 3: record results into conversation in original order,
|
||||
# build logged/real lists, and publish completed events.
|
||||
#
|
||||
# Vision-fallback prefetch: a single turn may fire several
|
||||
# image-producing tools in parallel (e.g. one screenshot
|
||||
# per tab). Captioning each one takes a vision LLM round
|
||||
# trip (1–30 s). Doing them sequentially in this loop
|
||||
# would serialise that latency per image. Instead, kick
|
||||
# off all caption tasks concurrently NOW, and await each
|
||||
# one just-in-time inside the per-tc body. If only a
|
||||
# single image needs captioning, this collapses to a
|
||||
# single await with no overhead.
|
||||
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
|
||||
caption_tasks: dict[str, asyncio.Task[tuple[str, str] | None]] = {}
|
||||
if _model_text_only:
|
||||
for tc in tool_calls[:executed_in_batch]:
|
||||
res = results_by_id.get(tc.tool_use_id)
|
||||
if not res or not res.image_content:
|
||||
continue
|
||||
intent = extract_intent_for_tool(
|
||||
conversation,
|
||||
tc.tool_name,
|
||||
tc.tool_input or {},
|
||||
)
|
||||
caption_tasks[tc.tool_use_id] = asyncio.create_task(_captioning_chain(intent, res.image_content))
|
||||
|
||||
for tc in tool_calls[:executed_in_batch]:
|
||||
result = results_by_id.get(tc.tool_use_id)
|
||||
if result is None:
|
||||
@@ -3338,7 +3451,6 @@ class AgentLoop(AgentProtocol):
|
||||
# Build log entries for real tools (exclude synthetic tools)
|
||||
if tc.tool_name not in (
|
||||
"ask_user",
|
||||
"ask_user_multiple",
|
||||
"escalate",
|
||||
):
|
||||
tool_entry = {
|
||||
@@ -3353,11 +3465,33 @@ class AgentLoop(AgentProtocol):
|
||||
logged_tool_calls.append(tool_entry)
|
||||
|
||||
image_content = result.image_content
|
||||
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
|
||||
logger.info(
|
||||
"Stripping image_content from tool result; model '%s' does not support images in tool results",
|
||||
ctx.llm.model,
|
||||
)
|
||||
# Vision-fallback marker spliced into the persisted text
|
||||
# below. None when no captioning ran (vision-capable
|
||||
# main model, no images, or no fallback chain reached
|
||||
# this tool).
|
||||
vision_fallback_marker: str | None = None
|
||||
if image_content and tc.tool_use_id in caption_tasks:
|
||||
caption_result = await caption_tasks.pop(tc.tool_use_id)
|
||||
if caption_result:
|
||||
caption, vision_model = caption_result
|
||||
vision_fallback_marker = f"[vision-fallback caption]\n{caption}"
|
||||
logger.info(
|
||||
"vision_fallback: captioned %d image(s) for tool '%s' "
|
||||
"(main model '%s' routed through fallback model '%s')",
|
||||
len(image_content),
|
||||
tc.tool_name,
|
||||
ctx.llm.model if ctx.llm else "?",
|
||||
vision_model,
|
||||
)
|
||||
else:
|
||||
vision_fallback_marker = "[image stripped — vision fallback exhausted]"
|
||||
logger.info(
|
||||
"vision_fallback: exhausted; stripping %d image(s) from "
|
||||
"tool '%s' result without caption (model '%s')",
|
||||
len(image_content),
|
||||
tc.tool_name,
|
||||
ctx.llm.model if ctx.llm else "?",
|
||||
)
|
||||
image_content = None
|
||||
|
||||
# Apply replay-detector steer prefix if this call matched a
|
||||
@@ -3369,6 +3503,11 @@ class AgentLoop(AgentProtocol):
|
||||
if _prefix:
|
||||
stored_content = f"{_prefix}{stored_content or ''}"
|
||||
|
||||
# Splice the vision-fallback caption / placeholder into
|
||||
# the persisted text after any prefix has been applied.
|
||||
if vision_fallback_marker:
|
||||
stored_content = f"{stored_content or ''}\n\n{vision_fallback_marker}"
|
||||
|
||||
await conversation.add_tool_result(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=stored_content,
|
||||
@@ -3376,7 +3515,7 @@ class AgentLoop(AgentProtocol):
|
||||
image_content=image_content,
|
||||
is_skill_content=result.is_skill_content,
|
||||
)
|
||||
if tc.tool_name in ("ask_user", "ask_user_multiple") and user_input_requested and not result.is_error:
|
||||
if tc.tool_name == "ask_user" and user_input_requested and not result.is_error:
|
||||
# Defer tool_call_completed until after user responds
|
||||
self._deferred_tool_complete = {
|
||||
"stream_id": stream_id,
|
||||
@@ -3457,8 +3596,6 @@ class AgentLoop(AgentProtocol):
|
||||
token_counts,
|
||||
logged_tool_calls,
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
final_system_prompt,
|
||||
final_messages,
|
||||
@@ -3509,8 +3646,6 @@ class AgentLoop(AgentProtocol):
|
||||
token_counts,
|
||||
logged_tool_calls,
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
final_system_prompt,
|
||||
final_messages,
|
||||
@@ -3530,10 +3665,6 @@ class AgentLoop(AgentProtocol):
|
||||
"""Build the synthetic ask_user tool. Delegates to synthetic_tools module."""
|
||||
return build_ask_user_tool()
|
||||
|
||||
def _build_ask_user_multiple_tool(self) -> Tool:
|
||||
"""Build the synthetic ask_user_multiple tool. Delegates to synthetic_tools module."""
|
||||
return build_ask_user_multiple_tool()
|
||||
|
||||
def _build_escalate_tool(self) -> Tool:
|
||||
"""Build the synthetic escalate tool. Delegates to synthetic_tools module."""
|
||||
return build_escalate_tool()
|
||||
@@ -4003,7 +4134,7 @@ class AgentLoop(AgentProtocol):
|
||||
queue=self._injection_queue,
|
||||
conversation=conversation,
|
||||
ctx=ctx,
|
||||
describe_images_as_text_fn=_describe_images_as_text,
|
||||
caption_image_fn=_captioning_chain,
|
||||
)
|
||||
|
||||
async def _drain_trigger_queue(self, conversation: NodeConversation) -> int:
|
||||
@@ -4067,6 +4198,74 @@ class AgentLoop(AgentProtocol):
|
||||
execution_id=execution_id,
|
||||
)
|
||||
|
||||
async def _maybe_inject_task_reminder(
|
||||
self,
|
||||
ctx: AgentContext,
|
||||
logged_tool_calls: list[dict[str, Any]] | None,
|
||||
) -> None:
|
||||
"""Layer 3 task-system steering — periodic reminder injection.
|
||||
|
||||
Called once per iteration after the LLM turn completes. If the
|
||||
model has been silent on task ops for a while AND there are open
|
||||
tasks on its session list, queue a system-style reminder onto
|
||||
the injection queue so the next iteration drains it as a user
|
||||
turn. Idempotent / safe to call always — gates internally.
|
||||
|
||||
``logged_tool_calls`` is a list of dicts with at least a "name"
|
||||
key, as accumulated by ``_run_single_turn``. Names like
|
||||
``task_create``, ``task_update``, ``colony_template_*`` reset
|
||||
the counter (see ``framework.tasks.reminders.TASK_OP_TOOL_NAMES``).
|
||||
"""
|
||||
from framework.tasks import get_task_store
|
||||
from framework.tasks.models import TaskStatus
|
||||
from framework.tasks.reminders import build_reminder, saw_task_op
|
||||
|
||||
state = self._task_reminder_state
|
||||
|
||||
# 1. Update counters based on this turn's tool calls.
|
||||
names: list[str] = []
|
||||
for call in logged_tool_calls or []:
|
||||
try:
|
||||
name = call.get("name") or call.get("tool_name")
|
||||
if name:
|
||||
names.append(name)
|
||||
except (AttributeError, TypeError):
|
||||
continue
|
||||
if saw_task_op(names):
|
||||
state.on_task_op()
|
||||
state.on_iteration()
|
||||
|
||||
# 2. Resolve the agent's task list. Skip if context isn't wired yet.
|
||||
list_id = getattr(ctx, "task_list_id", None)
|
||||
if not list_id:
|
||||
return
|
||||
|
||||
# 3. Read the open-task snapshot. Best-effort.
|
||||
try:
|
||||
store = get_task_store()
|
||||
records = await store.list_tasks(list_id)
|
||||
except Exception:
|
||||
return
|
||||
open_tasks = [r for r in records if r.status != TaskStatus.COMPLETED]
|
||||
if not state.should_remind(bool(open_tasks)):
|
||||
return
|
||||
|
||||
body = build_reminder(records)
|
||||
if not body:
|
||||
return
|
||||
|
||||
# 4. Enqueue. Drained at the next iteration's 6b drain step and
|
||||
# rendered as a user turn (with the "[External event]" prefix).
|
||||
await self._injection_queue.put((body, False, None))
|
||||
state.on_reminder_sent()
|
||||
logger.info(
|
||||
"[task-reminder] queued nudge for %s (open=%d, silent_turns=%d)",
|
||||
list_id,
|
||||
len(open_tasks),
|
||||
state.turns_since_task_op,
|
||||
)
|
||||
self._bump("task_reminders_sent")
|
||||
|
||||
async def _run_hooks(
|
||||
self,
|
||||
event: str,
|
||||
@@ -4128,6 +4327,8 @@ class AgentLoop(AgentProtocol):
|
||||
input_tokens: int,
|
||||
output_tokens: int,
|
||||
cached_tokens: int = 0,
|
||||
cache_creation_tokens: int = 0,
|
||||
cost_usd: float = 0.0,
|
||||
execution_id: str = "",
|
||||
iteration: int | None = None,
|
||||
) -> None:
|
||||
@@ -4140,6 +4341,8 @@ class AgentLoop(AgentProtocol):
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
execution_id=execution_id,
|
||||
iteration=iteration,
|
||||
)
|
||||
|
||||
@@ -56,6 +56,16 @@ class Message:
|
||||
# from a crashed or watchdog-cancelled stream. Signals that the original
|
||||
# turn never finished — the model may or may not choose to redo it.
|
||||
truncated: bool = False
|
||||
# When non-None, identifies the parent session id this message was
|
||||
# carried over from — used by fork_session_into_colony on the single
|
||||
# compacted-summary message it writes when a colony is born from a
|
||||
# queen DM. Presence of the field IS the "inherited" signal.
|
||||
inherited_from: str | None = None
|
||||
# True when this user message was synthesized from one or more
|
||||
# fired triggers (timer/webhook), not typed by a human. The LLM still
|
||||
# sees the message as a regular user turn; the UI uses this flag to
|
||||
# render it as a trigger banner instead of a speech bubble.
|
||||
is_trigger: bool = False
|
||||
|
||||
def to_llm_dict(self) -> dict[str, Any]:
|
||||
"""Convert to OpenAI-format message dict."""
|
||||
@@ -121,6 +131,10 @@ class Message:
|
||||
d["is_system_nudge"] = self.is_system_nudge
|
||||
if self.truncated:
|
||||
d["truncated"] = self.truncated
|
||||
if self.inherited_from is not None:
|
||||
d["inherited_from"] = self.inherited_from
|
||||
if self.is_trigger:
|
||||
d["is_trigger"] = self.is_trigger
|
||||
return d
|
||||
|
||||
@classmethod
|
||||
@@ -140,6 +154,8 @@ class Message:
|
||||
run_id=data.get("run_id"),
|
||||
is_system_nudge=data.get("is_system_nudge", False),
|
||||
truncated=data.get("truncated", False),
|
||||
inherited_from=data.get("inherited_from"),
|
||||
is_trigger=data.get("is_trigger", False),
|
||||
)
|
||||
|
||||
|
||||
@@ -411,9 +427,20 @@ class NodeConversation:
|
||||
store: ConversationStore | None = None,
|
||||
run_id: str | None = None,
|
||||
compaction_buffer_tokens: int | None = None,
|
||||
compaction_buffer_ratio: float | None = None,
|
||||
compaction_warning_buffer_tokens: int | None = None,
|
||||
) -> None:
|
||||
self._system_prompt = system_prompt
|
||||
# Optional split: when a caller updates the prompt with a
|
||||
# ``dynamic_suffix`` argument, we remember the static prefix and
|
||||
# suffix separately so the LLM wrapper can emit them as two
|
||||
# Anthropic system content blocks with a cache breakpoint between
|
||||
# them. ``_system_prompt`` stays as the concatenated form used for
|
||||
# persistence and for the legacy single-block LLM path.
|
||||
# On restore, these default to the concat/empty pair — the next
|
||||
# AgentLoop iteration's dynamic-prompt refresh step repopulates.
|
||||
self._system_prompt_static: str = system_prompt
|
||||
self._system_prompt_dynamic_suffix: str = ""
|
||||
self._max_context_tokens = max_context_tokens
|
||||
self._compaction_threshold = compaction_threshold
|
||||
# Buffer-based compaction trigger (Gap 7). When set, takes
|
||||
@@ -423,6 +450,11 @@ class NodeConversation:
|
||||
# limit. If left as None the legacy threshold-based rule is
|
||||
# used, keeping old call sites behaving identically.
|
||||
self._compaction_buffer_tokens = compaction_buffer_tokens
|
||||
# Ratio component of the hybrid buffer. Combines additively with
|
||||
# _compaction_buffer_tokens so callers can express "reserve N tokens
|
||||
# plus M% of the window" — the absolute floor matters on tiny
|
||||
# windows, the ratio matters on large ones.
|
||||
self._compaction_buffer_ratio = compaction_buffer_ratio
|
||||
self._compaction_warning_buffer_tokens = compaction_warning_buffer_tokens
|
||||
self._output_keys = output_keys
|
||||
self._store = store
|
||||
@@ -437,15 +469,56 @@ class NodeConversation:
|
||||
|
||||
@property
|
||||
def system_prompt(self) -> str:
|
||||
"""Full concatenated system prompt (static + dynamic suffix, if any).
|
||||
|
||||
This is the canonical form used for persistence and for the legacy
|
||||
single-block LLM path. Split-prompt callers should read
|
||||
``system_prompt_static`` and ``system_prompt_dynamic_suffix`` instead.
|
||||
"""
|
||||
return self._system_prompt
|
||||
|
||||
def update_system_prompt(self, new_prompt: str) -> None:
|
||||
@property
|
||||
def system_prompt_static(self) -> str:
|
||||
"""Static prefix of the system prompt (cache-stable).
|
||||
|
||||
Equals ``system_prompt`` when no split is in use. When the AgentLoop
|
||||
calls ``update_system_prompt(static, dynamic_suffix=...)``, this is
|
||||
the piece sent as the cache-controlled first block.
|
||||
"""
|
||||
return self._system_prompt_static
|
||||
|
||||
@property
|
||||
def system_prompt_dynamic_suffix(self) -> str:
|
||||
"""Dynamic tail of the system prompt (not cached).
|
||||
|
||||
Empty unless the consumer splits its prompt. The LLM wrapper uses a
|
||||
non-empty suffix to emit a two-block system content list with a
|
||||
cache breakpoint between the static prefix and this tail.
|
||||
"""
|
||||
return self._system_prompt_dynamic_suffix
|
||||
|
||||
def update_system_prompt(self, new_prompt: str, dynamic_suffix: str | None = None) -> None:
|
||||
"""Update the system prompt.
|
||||
|
||||
Used in continuous conversation mode at phase transitions to swap
|
||||
Layer 3 (focus) while preserving the conversation history.
|
||||
|
||||
When ``dynamic_suffix`` is provided, ``new_prompt`` is interpreted as
|
||||
the STATIC prefix and ``dynamic_suffix`` as the per-turn tail; they
|
||||
travel to the LLM as two separate cache-controlled blocks but are
|
||||
persisted as a single concatenated string for backward-compat
|
||||
restore. ``new_prompt`` alone (suffix left None) keeps the legacy
|
||||
single-string behavior.
|
||||
"""
|
||||
self._system_prompt = new_prompt
|
||||
if dynamic_suffix is None:
|
||||
# Legacy single-string path — static == full, no suffix split.
|
||||
self._system_prompt = new_prompt
|
||||
self._system_prompt_static = new_prompt
|
||||
self._system_prompt_dynamic_suffix = ""
|
||||
else:
|
||||
self._system_prompt_static = new_prompt
|
||||
self._system_prompt_dynamic_suffix = dynamic_suffix
|
||||
self._system_prompt = f"{new_prompt}\n\n{dynamic_suffix}" if dynamic_suffix else new_prompt
|
||||
self._meta_persisted = False # re-persist with new prompt
|
||||
|
||||
def set_current_phase(self, phase_id: str) -> None:
|
||||
@@ -485,6 +558,7 @@ class NodeConversation:
|
||||
is_client_input: bool = False,
|
||||
image_content: list[dict[str, Any]] | None = None,
|
||||
is_system_nudge: bool = False,
|
||||
is_trigger: bool = False,
|
||||
) -> Message:
|
||||
msg = Message(
|
||||
seq=self._next_seq,
|
||||
@@ -496,6 +570,7 @@ class NodeConversation:
|
||||
is_client_input=is_client_input,
|
||||
image_content=image_content,
|
||||
is_system_nudge=is_system_nudge,
|
||||
is_trigger=is_trigger,
|
||||
)
|
||||
self._messages.append(msg)
|
||||
self._next_seq += 1
|
||||
@@ -829,19 +904,30 @@ class NodeConversation:
|
||||
"""True when the conversation should be compacted before the
|
||||
next LLM call.
|
||||
|
||||
Buffer-based rule (Gap 7): trigger when the current estimate
|
||||
plus the configured buffer would exceed the hard context limit.
|
||||
Prevents compaction from firing only AFTER we're already over
|
||||
the wire and forced into a reactive binary-split pass.
|
||||
Hybrid buffer rule: the headroom reserved before compaction fires
|
||||
is the SUM of an absolute fixed component and a ratio of the hard
|
||||
context limit:
|
||||
|
||||
When no buffer is configured, falls back to the multiplicative
|
||||
threshold the old callers were built around.
|
||||
effective_buffer = compaction_buffer_tokens
|
||||
+ compaction_buffer_ratio * max_context_tokens
|
||||
|
||||
The fixed component gives a floor on tiny windows; the ratio
|
||||
keeps the trigger meaningful on large windows where any constant
|
||||
buffer becomes a rounding error (an 8k buffer is 75% on a 32k
|
||||
window but 96% on a 200k window). Compaction fires when the
|
||||
current estimate would consume more than (limit - effective_buffer).
|
||||
|
||||
When neither component is configured, falls back to the legacy
|
||||
multiplicative threshold so old callers keep behaving identically.
|
||||
"""
|
||||
if self._max_context_tokens <= 0:
|
||||
return False
|
||||
if self._compaction_buffer_tokens is not None:
|
||||
budget = self._max_context_tokens - self._compaction_buffer_tokens
|
||||
return self.estimate_tokens() >= max(0, budget)
|
||||
fixed = self._compaction_buffer_tokens
|
||||
ratio = self._compaction_buffer_ratio
|
||||
if fixed is not None or ratio is not None:
|
||||
effective_buffer = (fixed or 0) + (ratio or 0.0) * self._max_context_tokens
|
||||
budget = self._max_context_tokens - effective_buffer
|
||||
return self.estimate_tokens() >= max(0.0, budget)
|
||||
return self.estimate_tokens() >= self._max_context_tokens * self._compaction_threshold
|
||||
|
||||
def compaction_warning(self) -> bool:
|
||||
@@ -1498,6 +1584,7 @@ class NodeConversation:
|
||||
"max_context_tokens": self._max_context_tokens,
|
||||
"compaction_threshold": self._compaction_threshold,
|
||||
"compaction_buffer_tokens": self._compaction_buffer_tokens,
|
||||
"compaction_buffer_ratio": self._compaction_buffer_ratio,
|
||||
"compaction_warning_buffer_tokens": (self._compaction_warning_buffer_tokens),
|
||||
"output_keys": self._output_keys,
|
||||
}
|
||||
@@ -1547,6 +1634,7 @@ class NodeConversation:
|
||||
store=store,
|
||||
run_id=run_id,
|
||||
compaction_buffer_tokens=meta.get("compaction_buffer_tokens"),
|
||||
compaction_buffer_ratio=meta.get("compaction_buffer_ratio"),
|
||||
compaction_warning_buffer_tokens=meta.get("compaction_warning_buffer_tokens"),
|
||||
)
|
||||
conv._meta_persisted = True
|
||||
|
||||
@@ -16,7 +16,6 @@ import os
|
||||
import re
|
||||
import time
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.agent_loop.conversation import Message, NodeConversation
|
||||
@@ -31,19 +30,38 @@ logger = logging.getLogger(__name__)
|
||||
LLM_COMPACT_CHAR_LIMIT: int = 240_000
|
||||
LLM_COMPACT_MAX_DEPTH: int = 10
|
||||
|
||||
# Microcompaction: tools whose results can be safely cleared
|
||||
# Microcompaction: tools whose results can be safely cleared from context
|
||||
# because the agent can re-derive them on demand. The bar for inclusion is
|
||||
# "old result has no irreversible value": file content can be re-read, a
|
||||
# search can be re-run, a screenshot can be re-captured, terminal output can
|
||||
# be re-fetched, etc. Write / edit results are short confirmations whose
|
||||
# value is in the side effect, not the message — also fair game.
|
||||
COMPACTABLE_TOOLS: frozenset[str] = frozenset(
|
||||
{
|
||||
# File ops — content lives on disk, re-readable.
|
||||
"read_file",
|
||||
"run_command",
|
||||
"web_search",
|
||||
"web_fetch",
|
||||
"grep_search",
|
||||
"glob_search",
|
||||
"search_files",
|
||||
"write_file",
|
||||
"edit_file",
|
||||
"pdf_read",
|
||||
# Terminal — re-runnable; advanced job/output tools produce verbose
|
||||
# logs whose recent state is what matters.
|
||||
"terminal_exec",
|
||||
"terminal_rg",
|
||||
"terminal_find",
|
||||
"terminal_output_get",
|
||||
"terminal_job_logs",
|
||||
# Web / research — pages and queries can be re-fetched.
|
||||
"web_scrape",
|
||||
"search_papers",
|
||||
"download_paper",
|
||||
"search_wikipedia",
|
||||
# Browser read-only inspection — current page state is what matters,
|
||||
# old snapshots are stale by definition.
|
||||
"browser_screenshot",
|
||||
"list_directory",
|
||||
"browser_snapshot",
|
||||
"browser_html",
|
||||
"browser_get_text",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -371,6 +389,7 @@ async def llm_compact(
|
||||
char_limit: int = LLM_COMPACT_CHAR_LIMIT,
|
||||
max_depth: int = LLM_COMPACT_MAX_DEPTH,
|
||||
max_context_tokens: int = 128_000,
|
||||
preserve_user_messages: bool = False,
|
||||
) -> str:
|
||||
"""Summarise *messages* with LLM, splitting recursively if too large.
|
||||
|
||||
@@ -378,6 +397,11 @@ async def llm_compact(
|
||||
rejects the call with a context-length error, the messages are split
|
||||
in half and each half is summarised independently. Tool history is
|
||||
appended once at the top-level call (``_depth == 0``).
|
||||
|
||||
When ``preserve_user_messages`` is True, the prompt and system message
|
||||
are amplified to instruct the LLM to keep every user message verbatim
|
||||
and in full — used by the manual /compact-and-fork endpoint where the
|
||||
user wants their voice carried into the new session intact.
|
||||
"""
|
||||
from framework.agent_loop.conversation import extract_tool_call_history
|
||||
from framework.agent_loop.internals.tool_result_handler import is_context_too_large_error
|
||||
@@ -401,6 +425,7 @@ async def llm_compact(
|
||||
char_limit=char_limit,
|
||||
max_depth=max_depth,
|
||||
max_context_tokens=max_context_tokens,
|
||||
preserve_user_messages=preserve_user_messages,
|
||||
)
|
||||
else:
|
||||
prompt = build_llm_compaction_prompt(
|
||||
@@ -408,17 +433,30 @@ async def llm_compact(
|
||||
accumulator,
|
||||
formatted,
|
||||
max_context_tokens=max_context_tokens,
|
||||
preserve_user_messages=preserve_user_messages,
|
||||
)
|
||||
if preserve_user_messages:
|
||||
system_msg = (
|
||||
"You are a conversation compactor for an AI agent. "
|
||||
"Write a detailed summary that allows the agent to "
|
||||
"continue its work. CRITICAL: reproduce every user "
|
||||
"message verbatim and in full inside the 'User Messages' "
|
||||
"section — do not paraphrase, truncate, or merge them. "
|
||||
"Assistant turns and tool results may be summarised, but "
|
||||
"user input is sacred."
|
||||
)
|
||||
else:
|
||||
system_msg = (
|
||||
"You are a conversation compactor for an AI agent. "
|
||||
"Write a detailed summary that allows the agent to "
|
||||
"continue its work. Preserve user-stated rules, "
|
||||
"constraints, and account/identity preferences verbatim."
|
||||
)
|
||||
summary_budget = max(1024, max_context_tokens // 2)
|
||||
try:
|
||||
response = await ctx.llm.acomplete(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system=(
|
||||
"You are a conversation compactor for an AI agent. "
|
||||
"Write a detailed summary that allows the agent to "
|
||||
"continue its work. Preserve user-stated rules, "
|
||||
"constraints, and account/identity preferences verbatim."
|
||||
),
|
||||
system=system_msg,
|
||||
max_tokens=summary_budget,
|
||||
)
|
||||
summary = response.content
|
||||
@@ -437,6 +475,7 @@ async def llm_compact(
|
||||
char_limit=char_limit,
|
||||
max_depth=max_depth,
|
||||
max_context_tokens=max_context_tokens,
|
||||
preserve_user_messages=preserve_user_messages,
|
||||
)
|
||||
else:
|
||||
raise
|
||||
@@ -459,6 +498,7 @@ async def _llm_compact_split(
|
||||
char_limit: int = LLM_COMPACT_CHAR_LIMIT,
|
||||
max_depth: int = LLM_COMPACT_MAX_DEPTH,
|
||||
max_context_tokens: int = 128_000,
|
||||
preserve_user_messages: bool = False,
|
||||
) -> str:
|
||||
"""Split messages in half and summarise each half independently."""
|
||||
mid = max(1, len(messages) // 2)
|
||||
@@ -470,6 +510,7 @@ async def _llm_compact_split(
|
||||
char_limit=char_limit,
|
||||
max_depth=max_depth,
|
||||
max_context_tokens=max_context_tokens,
|
||||
preserve_user_messages=preserve_user_messages,
|
||||
)
|
||||
s2 = await llm_compact(
|
||||
ctx,
|
||||
@@ -479,6 +520,7 @@ async def _llm_compact_split(
|
||||
char_limit=char_limit,
|
||||
max_depth=max_depth,
|
||||
max_context_tokens=max_context_tokens,
|
||||
preserve_user_messages=preserve_user_messages,
|
||||
)
|
||||
return s1 + "\n\n" + s2
|
||||
|
||||
@@ -510,6 +552,7 @@ def build_llm_compaction_prompt(
|
||||
formatted_messages: str,
|
||||
*,
|
||||
max_context_tokens: int = 128_000,
|
||||
preserve_user_messages: bool = False,
|
||||
) -> str:
|
||||
"""Build prompt for LLM compaction targeting 50% of token budget.
|
||||
|
||||
@@ -539,6 +582,18 @@ def build_llm_compaction_prompt(
|
||||
target_chars = target_tokens * 4
|
||||
node_ctx = "\n".join(ctx_lines)
|
||||
|
||||
user_messages_section = (
|
||||
"6. **User Messages** — Reproduce EVERY user message verbatim and "
|
||||
"in full, in chronological order, each on its own line prefixed "
|
||||
'with the message index (e.g. "[U1] ..."). Do NOT paraphrase, '
|
||||
"summarise, merge, or omit any user message. Preserve markdown, "
|
||||
"code fences, whitespace, and punctuation exactly as the user "
|
||||
"wrote them.\n"
|
||||
if preserve_user_messages
|
||||
else "6. **User Messages** — Preserve ALL user-stated rules, constraints, "
|
||||
"identity preferences, and account details verbatim.\n"
|
||||
)
|
||||
|
||||
return (
|
||||
"You are compacting an AI agent's conversation history. "
|
||||
"The agent is still working and needs to continue.\n\n"
|
||||
@@ -559,8 +614,7 @@ def build_llm_compaction_prompt(
|
||||
"resolved. Include root causes so the agent doesn't repeat them.\n"
|
||||
"5. **Problem Solving Efforts** — Approaches tried, dead ends hit, "
|
||||
"and reasoning behind the current strategy.\n"
|
||||
"6. **User Messages** — Preserve ALL user-stated rules, constraints, "
|
||||
"identity preferences, and account details verbatim.\n"
|
||||
f"{user_messages_section}"
|
||||
"7. **Pending Tasks** — Work remaining, outputs still needed, and "
|
||||
"any blockers.\n"
|
||||
"8. **Current Work** — The most recent action taken and the immediate "
|
||||
@@ -621,8 +675,10 @@ def write_compaction_debug_log(
|
||||
level: str,
|
||||
inventory: list[dict[str, Any]] | None,
|
||||
) -> None:
|
||||
"""Write detailed compaction analysis to ~/.hive/compaction_log/."""
|
||||
log_dir = Path.home() / ".hive" / "compaction_log"
|
||||
"""Write detailed compaction analysis to $HIVE_HOME/compaction_log/."""
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
log_dir = HIVE_HOME / "compaction_log"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S_%f")
|
||||
@@ -821,7 +877,7 @@ def build_emergency_summary(
|
||||
if not all_files:
|
||||
parts.append(
|
||||
"NOTE: Large tool results may have been saved to files. "
|
||||
"Use list_directory to check the data directory."
|
||||
"Use search_files(target='files', path='.') to check the data directory."
|
||||
)
|
||||
except Exception:
|
||||
parts.append("NOTE: Large tool results were saved to files. Use read_file(path='<path>') to read them.")
|
||||
|
||||
@@ -12,6 +12,7 @@ import json
|
||||
import logging
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from framework.agent_loop.conversation import ConversationStore, NodeConversation
|
||||
@@ -161,9 +162,18 @@ async def drain_injection_queue(
|
||||
conversation: NodeConversation,
|
||||
*,
|
||||
ctx: NodeContext,
|
||||
describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[str | None]] | None) = None,
|
||||
caption_image_fn: (Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None,
|
||||
) -> int:
|
||||
"""Drain all pending injected events as user messages. Returns count."""
|
||||
"""Drain all pending injected events as user messages. Returns count.
|
||||
|
||||
``caption_image_fn`` is the unified vision fallback hook. It takes
|
||||
``(intent, image_content)`` and returns ``(caption, model)`` on
|
||||
success — the model id is logged so the destination is observable.
|
||||
The user's typed ``content`` (the injected message body) is passed
|
||||
as the intent so the captioner can answer the user's specific
|
||||
question about the image rather than producing a generic
|
||||
description; an empty content falls back to a generic intent.
|
||||
"""
|
||||
count = 0
|
||||
logger.debug(
|
||||
"[drain_injection_queue] Starting to drain queue, initial queue size: %s",
|
||||
@@ -183,23 +193,34 @@ async def drain_injection_queue(
|
||||
"Model '%s' does not support images; attempting vision fallback",
|
||||
ctx.llm.model,
|
||||
)
|
||||
if describe_images_as_text_fn is not None:
|
||||
description = await describe_images_as_text_fn(image_content)
|
||||
if description:
|
||||
if caption_image_fn is not None:
|
||||
intent = content or ("Describe these user-injected images for a text-only agent.")
|
||||
caption_result = await caption_image_fn(intent, image_content)
|
||||
if caption_result:
|
||||
description, vision_model = caption_result
|
||||
content = f"{content}\n\n{description}" if content else description
|
||||
logger.info("[drain] image described as text via vision fallback")
|
||||
logger.info(
|
||||
"[drain] image described as text via vision fallback (model '%s')",
|
||||
vision_model,
|
||||
)
|
||||
else:
|
||||
logger.info("[drain] no vision fallback available; images dropped")
|
||||
image_content = None
|
||||
# Real user input is stored as-is; external events get a prefix
|
||||
# Stamp every injected event with its arrival time so the model
|
||||
# has a consistent temporal log to reason over (and so the
|
||||
# stamp lives inside byte-stable conversation history instead
|
||||
# of a per-turn system-prompt tail). Minute precision is what
|
||||
# the queen needs for conversational / scheduling context.
|
||||
stamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M %Z")
|
||||
if is_client_input:
|
||||
stamped = f"[{stamp}] {content}" if content else f"[{stamp}]"
|
||||
await conversation.add_user_message(
|
||||
content,
|
||||
stamped,
|
||||
is_client_input=True,
|
||||
image_content=image_content,
|
||||
)
|
||||
else:
|
||||
await conversation.add_user_message(f"[External event]: {content}")
|
||||
await conversation.add_user_message(f"[{stamp}] [External event] {content}")
|
||||
count += 1
|
||||
except asyncio.QueueEmpty:
|
||||
break
|
||||
@@ -232,9 +253,12 @@ async def drain_trigger_queue(
|
||||
payload_str = json.dumps(t.payload, default=str)
|
||||
parts.append(f"[TRIGGER: {t.trigger_type}/{t.source_id}]{task_line}\n{payload_str}")
|
||||
|
||||
combined = "\n\n".join(parts)
|
||||
stamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M %Z")
|
||||
combined = f"[{stamp}]\n" + "\n\n".join(parts)
|
||||
logger.info("[drain] %d trigger(s): %s", len(triggers), combined[:200])
|
||||
await conversation.add_user_message(combined)
|
||||
# Tag the message so the UI can render a banner instead of the raw
|
||||
# `[TRIGGER: ...]` text. The LLM still sees `combined` verbatim.
|
||||
await conversation.add_user_message(combined, is_trigger=True)
|
||||
return len(triggers)
|
||||
|
||||
|
||||
|
||||
@@ -108,6 +108,8 @@ async def publish_llm_turn_complete(
|
||||
input_tokens: int,
|
||||
output_tokens: int,
|
||||
cached_tokens: int = 0,
|
||||
cache_creation_tokens: int = 0,
|
||||
cost_usd: float = 0.0,
|
||||
execution_id: str = "",
|
||||
iteration: int | None = None,
|
||||
) -> None:
|
||||
@@ -120,6 +122,8 @@ async def publish_llm_turn_complete(
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
execution_id=execution_id,
|
||||
iteration=iteration,
|
||||
)
|
||||
|
||||
@@ -91,108 +91,66 @@ def sanitize_ask_user_inputs(
|
||||
return q, recovered
|
||||
|
||||
|
||||
ask_user_prompt = """\
|
||||
Use this tool when you need to ask the user questions during execution. Reach for it when:
|
||||
|
||||
- The task is ambiguous and the user needs to choose an approach
|
||||
- You need missing information to continue
|
||||
- You want approval before taking a meaningful action
|
||||
- A decision has real trade-offs the user should weigh in on
|
||||
- You want post-task feedback, or to offer saving a skill or updating memory
|
||||
|
||||
Usage notes:
|
||||
- Users will always be able to select "Other" to provide custom text input, \
|
||||
so do not include catch-all options like "Other" or "Something else" yourself.
|
||||
- Each option is a plain string. Do NOT wrap options in `{"label": "..."}` or \
|
||||
`{"value": "..."}` objects — pass the raw choice text directly, e.g. `"Email"`, \
|
||||
not `{"label": "Email"}`.
|
||||
- If you recommend a specific option, make that the first option in the list \
|
||||
and append " (Recommended)" to the end of its text.
|
||||
- Call this tool whenever you need the user's response.
|
||||
- The prompt field must be plain text only.
|
||||
- Do not include XML, pseudo-tags, or inline option lists inside prompt.
|
||||
- Omit options only when the question truly requires a free-form response the \
|
||||
user must type out, such as describing an idea or pasting an error message.
|
||||
- Do not repeat the questions in your normal text response. The widget renders \
|
||||
them, so keep any surrounding text to a brief intro only.
|
||||
Example — single question with options:
|
||||
{"questions": [{"id": "next", "prompt": "What would you like to do?", \
|
||||
"options": ["Build a new agent (Recommended)", "Modify existing agent", "Run tests"]}]}
|
||||
|
||||
Example — batch:
|
||||
{"questions": [
|
||||
{"id": "scope", "prompt": "What scope?", "options": ["Full", "Partial"]},
|
||||
{"id": "format", "prompt": "Output format?", "options": ["PDF", "CSV", "JSON"]},
|
||||
{"id": "details", "prompt": "Any special requirements?"}
|
||||
]}
|
||||
|
||||
Example — free-form (queen only):
|
||||
{"questions": [{"id": "idea", "prompt": "Describe the agent you want to build."}]}
|
||||
"""
|
||||
|
||||
|
||||
def build_ask_user_tool() -> Tool:
|
||||
"""Build the synthetic ask_user tool for explicit user-input requests.
|
||||
|
||||
The queen calls ask_user() when it needs to pause and wait
|
||||
for user input. Text-only turns WITHOUT ask_user flow through without
|
||||
blocking, allowing progress updates and summaries to stream freely.
|
||||
The queen calls ask_user() when it needs to pause and wait for user
|
||||
input. Accepts an array of 1-8 questions — a single question for the
|
||||
common case, or a batch when several clarifications are needed at once.
|
||||
Text-only turns WITHOUT ask_user flow through without blocking, allowing
|
||||
progress updates and summaries to stream freely.
|
||||
"""
|
||||
return Tool(
|
||||
name="ask_user",
|
||||
description=(
|
||||
"You MUST call this tool whenever you need the user's response. "
|
||||
"Always call it after greeting the user, asking a question, or "
|
||||
"requesting approval. Do NOT call it for status updates or "
|
||||
"summaries that don't require a response.\n\n"
|
||||
"STRUCTURE RULES (CRITICAL):\n"
|
||||
"- The 'question' field is PLAIN TEXT shown to the user. Do NOT "
|
||||
"include XML tags, pseudo-tags like </question>, or option lists "
|
||||
"in the question string. The UI does not parse them — they "
|
||||
"render as raw text and look broken.\n"
|
||||
"- The 'options' parameter is the ONLY way to render buttons. "
|
||||
"If you want buttons, put them in the 'options' array, not in "
|
||||
"the question string. Do NOT write 'OPTIONS: [...]', "
|
||||
"'_options: [...]', or any inline list inside 'question'.\n"
|
||||
"- The question text must read as a single clean prompt with "
|
||||
"no markup. Example: 'What would you like to do?' — not "
|
||||
"'What would you like to do?</question>'.\n\n"
|
||||
"USAGE:\n"
|
||||
"Always include 2-3 predefined options. The UI automatically "
|
||||
"appends an 'Other' free-text input after your options, so NEVER "
|
||||
"include catch-all options like 'Custom idea', 'Something else', "
|
||||
"'Other', or 'None of the above' — the UI handles that. "
|
||||
"When the question primarily needs a typed answer but you must "
|
||||
"include options, make one option signal that typing is expected "
|
||||
"(e.g. 'I\\'ll type my response'). This helps users discover the "
|
||||
"free-text input. "
|
||||
"The ONLY exception: omit options when the question demands a "
|
||||
"free-form answer the user must type out (e.g. 'Describe your "
|
||||
"agent idea', 'Paste the error message').\n\n"
|
||||
"CORRECT EXAMPLE:\n"
|
||||
'{"question": "What would you like to do?", "options": '
|
||||
'["Build a new agent", "Modify existing agent", "Run tests"]}\n\n'
|
||||
"FREE-FORM EXAMPLE:\n"
|
||||
'{"question": "Describe the agent you want to build."}\n\n'
|
||||
"WRONG (do NOT do this — buttons will not render):\n"
|
||||
'{"question": "What now?</question>\\n_OPTIONS: [\\"A\\", \\"B\\"]"}'
|
||||
),
|
||||
parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question or prompt shown to the user.",
|
||||
},
|
||||
"options": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": (
|
||||
"2-3 specific predefined choices. Include in most cases. "
|
||||
'Example: ["Option A", "Option B", "Option C"]. '
|
||||
"The UI always appends an 'Other' free-text input, so "
|
||||
"do NOT include catch-alls like 'Custom idea' or 'Other'. "
|
||||
"Omit ONLY when the user must type a free-form answer."
|
||||
),
|
||||
"minItems": 2,
|
||||
"maxItems": 3,
|
||||
},
|
||||
},
|
||||
"required": ["question"],
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def build_ask_user_multiple_tool() -> Tool:
|
||||
"""Build the synthetic ask_user_multiple tool for batched questions.
|
||||
|
||||
Queen-only tool that presents multiple questions at once so the user
|
||||
can answer them all in a single interaction rather than one at a time.
|
||||
"""
|
||||
return Tool(
|
||||
name="ask_user_multiple",
|
||||
description=(
|
||||
"Ask the user multiple questions at once. Use this instead of "
|
||||
"ask_user when you have 2 or more questions to ask in the same "
|
||||
"turn — it lets the user answer everything in one go rather than "
|
||||
"going back and forth. Each question can have its own predefined "
|
||||
"options (2-3 choices) or be free-form. The UI renders all "
|
||||
"questions together with a single Submit button. "
|
||||
"ALWAYS prefer this over ask_user when you have multiple things "
|
||||
"to clarify. "
|
||||
"IMPORTANT: Do NOT repeat the questions in your text response — "
|
||||
"the widget renders them. Keep your text to a brief intro only. "
|
||||
'{"questions": ['
|
||||
' {"id": "scope", "prompt": "What scope?", "options": ["Full", "Partial"]},'
|
||||
' {"id": "format", "prompt": "Output format?", "options": ["PDF", "CSV", "JSON"]},'
|
||||
' {"id": "details", "prompt": "Any special requirements?"}'
|
||||
"]}"
|
||||
),
|
||||
description=ask_user_prompt,
|
||||
parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"questions": {
|
||||
"type": "array",
|
||||
"minItems": 1,
|
||||
"maxItems": 8,
|
||||
"description": "List of questions to present to the user.",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -208,8 +166,13 @@ def build_ask_user_multiple_tool() -> Tool:
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": (
|
||||
"2-3 predefined choices. The UI appends an "
|
||||
"'Other' free-text input automatically. "
|
||||
"2-3 predefined choices as plain strings "
|
||||
'(e.g. ["Yes", "No", "Maybe"]). Do NOT '
|
||||
'wrap items in {"label": "..."} or '
|
||||
'{"value": "..."} objects — pass the raw '
|
||||
"choice text directly. The UI appends an "
|
||||
"'Other' free-text input automatically, "
|
||||
"so don't include catch-all options. "
|
||||
"Omit only when the user must type a free-form answer."
|
||||
),
|
||||
"minItems": 2,
|
||||
@@ -218,9 +181,6 @@ def build_ask_user_multiple_tool() -> Tool:
|
||||
},
|
||||
"required": ["id", "prompt"],
|
||||
},
|
||||
"minItems": 2,
|
||||
"maxItems": 8,
|
||||
"description": "List of questions to present to the user.",
|
||||
},
|
||||
},
|
||||
"required": ["questions"],
|
||||
|
||||
@@ -0,0 +1,291 @@
|
||||
"""Generic coercion of LLM-emitted tool arguments to match each tool's JSON schema.
|
||||
|
||||
Small/mid-size models drift from tool schemas in predictable, boring ways:
|
||||
|
||||
- A number field comes back as a string (``"42"`` instead of ``42``).
|
||||
- A boolean field comes back as a string (``"true"`` instead of ``True``).
|
||||
- An array-of-string field comes back as an array of objects
|
||||
(``[{"label": "A"}, ...]`` instead of ``["A", ...]``).
|
||||
- An array/object field comes back as a JSON-encoded string
|
||||
(``'["A","B"]'`` instead of ``["A", "B"]``).
|
||||
- A lone scalar arrives where the schema expects an array.
|
||||
|
||||
This module centralizes the healing in one schema-driven pass that runs
|
||||
on every tool call before dispatch. Coercion is conservative:
|
||||
|
||||
- Values that already match the expected type are untouched.
|
||||
- Shapes we don't recognize are returned as-is, so real bugs surface
|
||||
instead of getting silently munged into something plausible.
|
||||
- Every actual coercion is logged with the tool, property, and shape
|
||||
transition so we can see which models/tools are drifting.
|
||||
|
||||
Tool-specific prompt drift (e.g. ``</question>`` tags leaking into an
|
||||
``ask_user`` prompt string) is NOT this module's job — that belongs in
|
||||
per-tool sanitizers, because it's about prompt style, not schema shape.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from framework.llm.provider import Tool
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# When an ``array<string>`` field arrives as an array of objects, look
|
||||
# for a text-carrying field in preference order. Covers the wrappers
|
||||
# small models tend to produce: ``[{"label": "A"}]``, ``[{"value": "A"}]``,
|
||||
# ``[{"text": "A"}]``, etc.
|
||||
_STRING_EXTRACT_KEYS: tuple[str, ...] = (
|
||||
"label",
|
||||
"value",
|
||||
"text",
|
||||
"name",
|
||||
"title",
|
||||
"display",
|
||||
)
|
||||
|
||||
|
||||
def coerce_tool_input(tool: Tool, raw_input: dict[str, Any] | None) -> dict[str, Any]:
|
||||
"""Coerce *raw_input* in place to match *tool*'s JSON schema.
|
||||
|
||||
Returns the mutated input dict (same object as *raw_input* when
|
||||
possible, for callers that assume in-place mutation). Properties
|
||||
not present in the schema are left untouched.
|
||||
"""
|
||||
if not isinstance(raw_input, dict):
|
||||
return raw_input or {}
|
||||
|
||||
schema = tool.parameters or {}
|
||||
props = schema.get("properties")
|
||||
if not isinstance(props, dict):
|
||||
return raw_input
|
||||
|
||||
for key in list(raw_input.keys()):
|
||||
prop_schema = props.get(key)
|
||||
if not isinstance(prop_schema, dict):
|
||||
continue
|
||||
original = raw_input[key]
|
||||
coerced = _coerce(original, prop_schema)
|
||||
if coerced is not original:
|
||||
logger.info(
|
||||
"coerced tool input tool=%s prop=%s from=%s to=%s",
|
||||
tool.name,
|
||||
key,
|
||||
_shape(original),
|
||||
_shape(coerced),
|
||||
)
|
||||
raw_input[key] = coerced
|
||||
|
||||
return raw_input
|
||||
|
||||
|
||||
def _coerce(value: Any, schema: dict[str, Any]) -> Any:
|
||||
"""Dispatch on the schema's ``type`` field.
|
||||
|
||||
Returns the *same object* on passthrough so callers can detect
|
||||
no-ops via identity (``coerced is value``).
|
||||
"""
|
||||
expected = schema.get("type")
|
||||
if not expected:
|
||||
return value
|
||||
|
||||
# Union type: try each in order, return the first coercion that
|
||||
# actually changes the value. Falls back to the original.
|
||||
if isinstance(expected, list):
|
||||
for t in expected:
|
||||
sub_schema = {**schema, "type": t}
|
||||
coerced = _coerce(value, sub_schema)
|
||||
if coerced is not value:
|
||||
return coerced
|
||||
return value
|
||||
|
||||
if expected == "integer":
|
||||
return _coerce_integer(value)
|
||||
if expected == "number":
|
||||
return _coerce_number(value)
|
||||
if expected == "boolean":
|
||||
return _coerce_boolean(value)
|
||||
if expected == "string":
|
||||
return _coerce_string(value)
|
||||
if expected == "array":
|
||||
return _coerce_array(value, schema)
|
||||
if expected == "object":
|
||||
return _coerce_object(value, schema)
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def _coerce_integer(value: Any) -> Any:
|
||||
# bool is a subclass of int in Python; don't mistake True for 1 here.
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
parsed = _parse_number(value)
|
||||
if parsed is None:
|
||||
return value
|
||||
if parsed != int(parsed):
|
||||
# Has a fractional part — caller asked for int, don't truncate.
|
||||
return value
|
||||
return int(parsed)
|
||||
return value
|
||||
|
||||
|
||||
def _coerce_number(value: Any) -> Any:
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, (int, float)):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
parsed = _parse_number(value)
|
||||
if parsed is None:
|
||||
return value
|
||||
if parsed == int(parsed):
|
||||
return int(parsed)
|
||||
return parsed
|
||||
return value
|
||||
|
||||
|
||||
def _coerce_boolean(value: Any) -> Any:
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
low = value.strip().lower()
|
||||
if low == "true":
|
||||
return True
|
||||
if low == "false":
|
||||
return False
|
||||
return value
|
||||
|
||||
|
||||
def _coerce_string(value: Any) -> Any:
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
# Common drift: model sent ``{"label": "..."}`` when we wanted "...".
|
||||
if isinstance(value, dict):
|
||||
extracted = _extract_string_from_object(value)
|
||||
if extracted is not None:
|
||||
return extracted
|
||||
return value
|
||||
|
||||
|
||||
def _coerce_array(value: Any, schema: dict[str, Any]) -> Any:
|
||||
# Heal: JSON-encoded array string → array.
|
||||
if isinstance(value, str):
|
||||
parsed = _try_parse_json(value)
|
||||
if isinstance(parsed, list):
|
||||
value = parsed
|
||||
else:
|
||||
# Scalar string where an array is expected — wrap it.
|
||||
return [value]
|
||||
elif not isinstance(value, list):
|
||||
# Any other scalar (int, bool, dict, ...) — wrap.
|
||||
return [value]
|
||||
|
||||
items_schema = schema.get("items")
|
||||
if not isinstance(items_schema, dict):
|
||||
return value
|
||||
|
||||
coerced_items: list[Any] = []
|
||||
changed = False
|
||||
for item in value:
|
||||
c = _coerce(item, items_schema)
|
||||
if c is not item:
|
||||
changed = True
|
||||
coerced_items.append(c)
|
||||
return coerced_items if changed else value
|
||||
|
||||
|
||||
def _coerce_object(value: Any, schema: dict[str, Any]) -> Any:
|
||||
# Heal: JSON-encoded object string → object.
|
||||
if isinstance(value, str):
|
||||
parsed = _try_parse_json(value)
|
||||
if isinstance(parsed, dict):
|
||||
value = parsed
|
||||
else:
|
||||
return value
|
||||
if not isinstance(value, dict):
|
||||
return value
|
||||
|
||||
sub_props = schema.get("properties")
|
||||
if not isinstance(sub_props, dict):
|
||||
return value
|
||||
|
||||
changed = False
|
||||
for k in list(value.keys()):
|
||||
sub_schema = sub_props.get(k)
|
||||
if not isinstance(sub_schema, dict):
|
||||
continue
|
||||
original = value[k]
|
||||
coerced = _coerce(original, sub_schema)
|
||||
if coerced is not original:
|
||||
value[k] = coerced
|
||||
changed = True
|
||||
# Return the same dict on mutation so callers that passed a shared
|
||||
# reference see the updates. ``changed`` is only used to decide
|
||||
# whether we need to log at a coarser level upstream.
|
||||
return value if changed or not sub_props else value
|
||||
|
||||
|
||||
def _extract_string_from_object(obj: dict[str, Any]) -> str | None:
|
||||
"""Pick a likely-text field out of a wrapper object.
|
||||
|
||||
Tries the known keys first, falls back to the sole value if the
|
||||
object has exactly one entry. Returns None when nothing plausible
|
||||
is found — the caller keeps the original.
|
||||
"""
|
||||
for k in _STRING_EXTRACT_KEYS:
|
||||
v = obj.get(k)
|
||||
if isinstance(v, str) and v:
|
||||
return v
|
||||
if len(obj) == 1:
|
||||
(only,) = obj.values()
|
||||
if isinstance(only, str) and only:
|
||||
return only
|
||||
return None
|
||||
|
||||
|
||||
def _try_parse_json(raw: str) -> Any:
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
|
||||
def _parse_number(raw: str) -> float | None:
|
||||
try:
|
||||
f = float(raw)
|
||||
except (ValueError, OverflowError):
|
||||
return None
|
||||
# Reject NaN and inf — they pass float() but aren't useful numeric
|
||||
# values for tool arguments.
|
||||
if f != f or f == float("inf") or f == float("-inf"):
|
||||
return None
|
||||
return f
|
||||
|
||||
|
||||
def _shape(value: Any) -> str:
|
||||
"""Short type/shape description used in coercion log lines."""
|
||||
if value is None:
|
||||
return "None"
|
||||
if isinstance(value, bool):
|
||||
return "bool"
|
||||
if isinstance(value, int):
|
||||
return "int"
|
||||
if isinstance(value, float):
|
||||
return "float"
|
||||
if isinstance(value, str):
|
||||
return f"str[{len(value)}]"
|
||||
if isinstance(value, list):
|
||||
if not value:
|
||||
return "list[0]"
|
||||
return f"list[{len(value)}]<{_shape(value[0])}>"
|
||||
if isinstance(value, dict):
|
||||
keys = sorted(value.keys())[:3]
|
||||
suffix = ",…" if len(value) > 3 else ""
|
||||
return f"dict{{{','.join(keys)}{suffix}}}"
|
||||
return type(value).__name__
|
||||
@@ -69,6 +69,20 @@ class LoopConfig:
|
||||
# and less tight than Anthropic's own counting. Override via
|
||||
# LoopConfig for larger windows.
|
||||
compaction_buffer_tokens: int = 8_000
|
||||
# Ratio-based component of the hybrid compaction buffer. Effective
|
||||
# headroom reserved before compaction fires is
|
||||
# compaction_buffer_tokens + compaction_buffer_ratio * max_context_tokens
|
||||
# The ratio scales with the model's window where the absolute fixed
|
||||
# component does not (an 8k absolute buffer is 75% trigger on a 32k
|
||||
# window but 96% on a 200k window). Combining them gives an absolute
|
||||
# floor sized for the worst-case single tool result (one un-spilled
|
||||
# max_tool_result_chars payload ≈ 30k chars ≈ 7.5k tokens, rounded to
|
||||
# 8k) plus a fractional headroom that keeps the trigger meaningful on
|
||||
# large windows, so the inner tool loop always has room to grow
|
||||
# without tripping the mid-turn pre-send guard. Defaults: 8k + 15%.
|
||||
# On 32k that's a 12.8k buffer (~60% trigger); on 200k it's 38k
|
||||
# (~81% trigger); on 1M it's 158k (~84% trigger).
|
||||
compaction_buffer_ratio: float = 0.15
|
||||
# Warning is emitted one buffer earlier so the user/telemetry gets
|
||||
# a "we're close" signal without triggering a compaction pass.
|
||||
compaction_warning_buffer_tokens: int = 12_000
|
||||
|
||||
@@ -0,0 +1,306 @@
|
||||
"""Vision-fallback subagent for tool-result images on text-only LLMs.
|
||||
|
||||
When a tool returns image content but the main agent's model can't
|
||||
accept image blocks (i.e. its catalog entry has ``supports_vision: false``),
|
||||
the framework strips the images before they ever reach the LLM. Without
|
||||
this module, the agent then sees only the tool's text envelope (URL,
|
||||
dimensions, size) and is blind to whatever the image actually shows.
|
||||
|
||||
This module provides:
|
||||
|
||||
* ``caption_tool_image()`` — direct LiteLLM call to a configured
|
||||
vision model (``vision_fallback`` block in ``~/.hive/configuration.json``)
|
||||
that takes the agent's intent + the image(s) and returns a textual
|
||||
description tailored to that intent.
|
||||
* ``extract_intent_for_tool()`` — pull the most recent assistant text
|
||||
+ the tool call descriptor and concatenate them into a ≤2KB intent
|
||||
string the vision subagent can reason against.
|
||||
|
||||
Both helpers degrade silently — return ``None`` / a placeholder rather
|
||||
than raise — so a vision-fallback failure can never kill the main
|
||||
agent's run. The agent-loop call site retries the configured model
|
||||
once on a None return, then falls back to
|
||||
``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
|
||||
of :func:`caption_tool_image`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.config import (
|
||||
get_vision_fallback_api_base,
|
||||
get_vision_fallback_api_key,
|
||||
get_vision_fallback_model,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..conversation import NodeConversation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Upper bound on the intent string handed to the vision subagent. It only
# needs the agent's recent reasoning plus the tool descriptor — anything
# longer is wasted tokens (and risks crowding the vision model's context
# once the image is attached).
_INTENT_MAX_CHARS = 4096

# Upper bound on the serialized tool-args snippet inside the intent. Some
# tool inputs (large strings, file contents) would dominate it if uncapped.
_TOOL_ARGS_MAX_CHARS = 4096

# Subagent system prompt — deliberately short so it fits within any
# provider's system-prompt budget alongside the user message + image. It
# pins the subagent's role and output format.
#
# Coordinate labeling: the main agent's browser tools
# (browser_click_coordinate / browser_hover_coordinate / browser_press_at)
# accept VIEWPORT FRACTIONS (x, y) in [0..1], where (0,0) is the top-left
# and (1,1) the bottom-right of the screenshot. A text-only agent can read
# a caption but cannot point, so every interactive element the subagent
# names (button, link, input, icon, tab, menu item, dialog control) gets
# its approximate viewport-fraction centre as ``(fx, fy)`` right after the
# element's name, e.g. ``"Submit" button (0.83, 0.92)``. Three rules:
# (1) coordinates only for plausibly clickable / hoverable / typeable
# things — never pure body text or decorative graphics; (2) eyeball to two
# decimal places — more precision is false confidence; (3) never invent —
# omit the coordinate for anything partly off-screen or unlocatable.
_VISION_SUBAGENT_SYSTEM = (
    "You are a vision subagent for a text-only main agent. The main "
    "agent invoked a tool that returned the image(s) attached. Their "
    "intent (their reasoning + the tool call) is below. Describe what "
    "the image shows in service of their intent — concrete, factual, "
    "no speculation. If their intent asks a yes/no question, answer it "
    "directly first.\n\n"
    "Coordinate labeling: the main agent uses fractional viewport "
    "coordinates (x, y) in [0..1] — (0, 0) is the top-left of the "
    "image, (1, 1) is the bottom-right — to drive its click / hover / "
    "key-press tools. For every interactive element you mention "
    "(button, link, input, checkbox, radio, dropdown, tab, menu item, "
    "dialog control, icon), append its approximate centre as "
    "``(fx, fy)`` immediately after the element's name or label, e.g. "
    '``"Submit" button (0.83, 0.92)`` or ``profile avatar icon '
    "(0.05, 0.07)``. Use two decimal places — more is false precision. "
    "Skip coordinates for pure body text and decorative elements that "
    "aren't clickable. If an element is partially off-screen or you "
    "cannot reliably locate its centre, omit the coordinate rather "
    "than guessing.\n\n"
    "Output plain text, no markdown, ≤ 600 words."
)


def extract_intent_for_tool(
    conversation: NodeConversation,
    tool_name: str,
    tool_args: dict[str, Any] | None,
) -> str:
    """Build the intent string passed to the vision subagent.

    Joins a structured tool-call descriptor with the most recent
    assistant text (the LLM's reasoning right before it invoked the
    tool). The result is capped at ``_INTENT_MAX_CHARS`` total;
    truncation favours the head of the assistant text, where
    goal-stating sentences usually live.

    When no preceding assistant text exists (rare — first turn), the
    placeholder ``"<no preceding reasoning>"`` stands in so the
    subagent still receives the tool descriptor.
    """
    try:
        serialized_args = json.dumps(tool_args or {}, default=str)
    except Exception:
        serialized_args = repr(tool_args)
    if len(serialized_args) > _TOOL_ARGS_MAX_CHARS:
        serialized_args = serialized_args[:_TOOL_ARGS_MAX_CHARS] + "…"

    descriptor = f"Called: {tool_name}({serialized_args})"

    # Scan newest → oldest for the first assistant message carrying text.
    reasoning = ""
    try:
        history = getattr(conversation, "_messages", []) or []
        for entry in reversed(history):
            if getattr(entry, "role", None) != "assistant":
                continue
            body = getattr(entry, "content", "") or ""
            if isinstance(body, str) and body.strip():
                reasoning = body.strip()
                break
    except Exception:
        # Defensive — the agent loop must keep running even if the
        # conversation structure changes shape.
        reasoning = ""

    if not reasoning:
        reasoning = "<no preceding reasoning>"

    # Intent = tool descriptor (always intact) + reasoning (truncated).
    prefix = f"{descriptor}\n\nReasoning before call:\n"
    remaining = _INTENT_MAX_CHARS - len(prefix)
    if remaining < 100:
        # Tool descriptor is huge somehow — truncate it instead.
        return prefix[:_INTENT_MAX_CHARS]
    if len(reasoning) > remaining:
        reasoning = reasoning[: remaining - 1] + "…"
    return prefix + reasoning
|
||||
|
||||
|
||||
async def caption_tool_image(
    intent: str,
    image_content: list[dict[str, Any]],
    *,
    timeout_s: float = 30.0,
    model_override: str | None = None,
) -> tuple[str, str] | None:
    """Caption the given images using the configured ``vision_fallback`` model.

    Args:
        intent: Output of :func:`extract_intent_for_tool` — the main
            agent's reasoning + tool descriptor the caption should serve.
        image_content: Image content blocks exactly as the tool returned
            them; appended verbatim after the intent text block.
        timeout_s: Per-call timeout forwarded to litellm.
        model_override: Swaps in a different litellm model id while
            keeping the configured ``vision_fallback`` ``api_key`` /
            ``api_base`` untouched. That's deliberate: Hive subscribers
            configure ``vision_fallback`` to point at the Hive proxy,
            which routes to multiple models including Gemini — so reusing
            the credentials lets a Gemini-3-flash override still work
            without a separate ``GEMINI_API_KEY``. When no creds are
            configured, litellm falls back to env-var resolution.

    Returns:
        ``(caption, model)`` on success or ``None`` on any failure
        (no config, no API key, timeout, exception, empty response).

    Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
    """
    model = model_override or get_vision_fallback_model()
    if not model:
        return None
    api_key = get_vision_fallback_api_key()
    api_base = get_vision_fallback_api_base()
    if not api_key and not model_override:
        logger.debug("vision_fallback configured but no API key resolved; skipping")
        return None

    # litellm is an optional dependency for this path — degrade silently.
    try:
        import litellm
    except ImportError:
        return None

    user_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
    user_blocks.extend(image_content)
    messages = [
        {"role": "system", "content": _VISION_SUBAGENT_SYSTEM},
        {"role": "user", "content": user_blocks},
    ]

    # Apply the same proxy rewrites the main LLM provider uses so a
    # `hive/...` / `kimi/...` model resolves to the right Anthropic-
    # compatible endpoint with the right auth header. Without this,
    # litellm doesn't know what `hive/kimi-k2.5` is and rejects the call
    # with "LLM Provider NOT provided."
    from framework.llm.litellm import rewrite_proxy_model

    rewritten_model, rewritten_base, extra_headers = rewrite_proxy_model(model, api_key, api_base)

    kwargs: dict[str, Any] = {
        "model": rewritten_model,
        "messages": messages,
        "max_tokens": 8192,
        "timeout": timeout_s,
    }
    # Always pass api_key when we have one, even alongside proxy-rewritten
    # extra_headers. litellm's anthropic handler refuses to dispatch
    # without an api_key (it sends it as x-api-key); the proxy itself
    # authenticates via the Authorization: Bearer header in
    # extra_headers. Both are needed — matches LiteLLMProvider's path.
    if api_key:
        kwargs["api_key"] = api_key
    if rewritten_base:
        kwargs["api_base"] = rewritten_base
    if extra_headers:
        kwargs["extra_headers"] = extra_headers

    # Surface where the request is going so the user can verify the
    # vision fallback is hitting the expected proxy / model. Redacts
    # the API key to a length+head+tail digest so it can be cross-
    # correlated with other auth-related log lines.
    key_digest = (
        f"len={len(api_key)} {api_key[:8]}…{api_key[-4:]}"
        if api_key and len(api_key) >= 12
        else f"len={len(api_key) if api_key else 0}"
    )
    logger.info(
        "[vision_fallback] dispatching: configured_model=%s rewritten_model=%s "
        "api_base=%s api_key=%s images=%d intent_chars=%d timeout_s=%.1f",
        model,
        rewritten_model,
        rewritten_base or "<litellm-default>",
        key_digest,
        len(image_content),
        len(intent),
        timeout_s,
    )

    started = datetime.now()
    caption: str | None = None
    error_text: str | None = None
    try:
        response = await litellm.acompletion(**kwargs)
        # A blank/whitespace-only response leaves ``caption`` as None and
        # the function returns None below — same as an outright failure.
        text = (response.choices[0].message.content or "").strip()
        if text:
            caption = text
        logger.info(
            "[vision_fallback] response: model=%s api_base=%s elapsed_s=%.2f chars=%d",
            rewritten_model,
            rewritten_base or "<litellm-default>",
            (datetime.now() - started).total_seconds(),
            len(text),
        )
    except Exception as exc:
        error_text = f"{type(exc).__name__}: {exc}"
        logger.warning(
            "[vision_fallback] failed: model=%s api_base=%s error=%s",
            rewritten_model,
            rewritten_base or "<litellm-default>",
            error_text,
        )

    # Best-effort audit log so users can grep ~/.hive/llm_logs/ for
    # vision-fallback subagent calls. Failures here must not bubble.
    try:
        from framework.tracker.llm_debug_logger import log_llm_turn

        # Don't dump the base64 image data into the log file — that
        # would balloon the jsonl with mostly-binary noise.
        elided_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
        elided_blocks.extend({"type": "image_url", "image_url": {"url": "<elided>"}} for _ in image_content)
        log_llm_turn(
            node_id="vision_fallback_subagent",
            stream_id="vision_fallback",
            execution_id="vision_fallback_subagent",
            iteration=0,
            system_prompt=_VISION_SUBAGENT_SYSTEM,
            messages=[{"role": "user", "content": elided_blocks}],
            assistant_text=caption or "",
            tool_calls=[],
            tool_results=[],
            token_counts={
                "model": model,
                "elapsed_s": (datetime.now() - started).total_seconds(),
                "error": error_text,
                "num_images": len(image_content),
                "intent_chars": len(intent),
            },
        )
    except Exception:
        pass

    if caption is None:
        return None
    return caption, model
|
||||
|
||||
|
||||
__all__ = ["caption_tool_image", "extract_intent_for_tool"]
|
||||
@@ -53,7 +53,14 @@ def build_prompt_spec(
|
||||
# trigger tools are present in this agent's tool list (e.g. browser_*
|
||||
# pulls in hive.browser-automation). Keeps non-browser agents lean.
|
||||
tool_names = [getattr(t, "name", "") for t in (getattr(ctx, "available_tools", None) or [])]
|
||||
skills_catalog_prompt = augment_catalog_for_tools(ctx.skills_catalog_prompt or "", tool_names)
|
||||
raw_catalog = ctx.skills_catalog_prompt or ""
|
||||
dynamic_catalog = getattr(ctx, "dynamic_skills_catalog_provider", None)
|
||||
if dynamic_catalog is not None:
|
||||
try:
|
||||
raw_catalog = dynamic_catalog() or ""
|
||||
except Exception:
|
||||
raw_catalog = ctx.skills_catalog_prompt or ""
|
||||
skills_catalog_prompt = augment_catalog_for_tools(raw_catalog, tool_names)
|
||||
|
||||
return PromptSpec(
|
||||
identity_prompt=ctx.identity_prompt or "",
|
||||
|
||||
@@ -180,9 +180,39 @@ class AgentContext:
|
||||
|
||||
stream_id: str = ""
|
||||
|
||||
# ----- Task system fields (see framework/tasks) -------------------
|
||||
# task_list_id: this agent's own session-scoped list, e.g.
|
||||
# session:{agent_id}:{session_id}. Set by the runner / ColonyRuntime
|
||||
# before the loop starts; immutable after first task_create.
|
||||
task_list_id: str | None = None
|
||||
# colony_id: set on the queen of a colony AND on every spawned worker
|
||||
# so workers can render the "picked up" chip and the queen can address
|
||||
# her colony template via colony_template_* tools.
|
||||
colony_id: str | None = None
|
||||
# picked_up_from: for workers, the (colony_task_list_id, template_task_id)
|
||||
# pair their session was spawned for. None for the queen and queen-DM.
|
||||
picked_up_from: tuple[str, int] | None = None
|
||||
|
||||
dynamic_tools_provider: Any = None
|
||||
dynamic_prompt_provider: Any = None
|
||||
# Optional Callable[[], str]: when set alongside ``dynamic_prompt_provider``,
|
||||
# the AgentLoop sends the system prompt as two pieces — the result of
|
||||
# ``dynamic_prompt_provider`` is the STATIC block (cached), and this
|
||||
# provider returns the DYNAMIC suffix (not cached). The LLM wrapper
|
||||
# emits them as two Anthropic system content blocks with a cache
|
||||
# breakpoint between them for providers that honor ``cache_control``.
|
||||
# For providers that don't, the two strings are concatenated. Used by
|
||||
# the Queen to keep her persona/role/tools block warm across iterations
|
||||
# while the recall + timestamp tail refreshes per user turn.
|
||||
dynamic_prompt_suffix_provider: Any = None
|
||||
dynamic_memory_provider: Any = None
|
||||
# Optional Callable[[], str]: when set, the current skills-catalog
|
||||
# prompt is sourced from this provider each iteration. Lets workers
|
||||
# pick up UI toggles without restarting the run. Queen agents already
|
||||
# rebuild the whole prompt via dynamic_prompt_provider — this field
|
||||
# is a surgical alternative used by colony workers where the rest of
|
||||
# the prompt stays constant and we don't want to thrash the cache.
|
||||
dynamic_skills_catalog_provider: Any = None
|
||||
|
||||
skills_catalog_prompt: str = ""
|
||||
protocols_prompt: str = ""
|
||||
|
||||
@@ -560,7 +560,9 @@ class CredentialTesterAgent:
|
||||
if self._selected_account is None:
|
||||
raise RuntimeError("No account selected. Call select_account() first.")
|
||||
|
||||
self._storage_path = Path.home() / ".hive" / "agents" / "credential_tester"
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
self._storage_path = HIVE_HOME / "agents" / "credential_tester"
|
||||
self._storage_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self._tool_registry = ToolRegistry()
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -47,6 +48,8 @@ class AgentEntry:
|
||||
tool_count: int = 0
|
||||
tags: list[str] = field(default_factory=list)
|
||||
last_active: str | None = None
|
||||
created_at: str | None = None
|
||||
icon: str | None = None
|
||||
workers: list[WorkerEntry] = field(default_factory=list)
|
||||
|
||||
|
||||
@@ -63,7 +66,9 @@ def _get_last_active(agent_path: Path) -> str | None:
|
||||
latest: str | None = None
|
||||
|
||||
# 1. Worker sessions
|
||||
sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
sessions_dir = HIVE_HOME / "agents" / agent_name / "sessions"
|
||||
if sessions_dir.exists():
|
||||
for session_dir in sessions_dir.iterdir():
|
||||
if not session_dir.is_dir() or not session_dir.name.startswith("session_"):
|
||||
@@ -112,7 +117,9 @@ def _get_last_active(agent_path: Path) -> str | None:
|
||||
|
||||
def _count_sessions(agent_name: str) -> int:
    """Count session directories under $HIVE_HOME/agents/{agent_name}/sessions/.

    Only direct children whose name starts with ``session_`` are counted;
    anything else in the directory is ignored. Returns 0 when the
    sessions directory does not exist.
    """
    # NOTE: the old hard-coded ``Path.home() / ".hive"`` assignment was a
    # dead store left over from the HIVE_HOME migration — removed. The
    # local import matches the file's existing convention for HIVE_HOME.
    from framework.config import HIVE_HOME

    sessions_dir = HIVE_HOME / "agents" / agent_name / "sessions"
    if not sessions_dir.exists():
        return 0
    return sum(1 for d in sessions_dir.iterdir() if d.is_dir() and d.name.startswith("session_"))
|
||||
@@ -120,7 +127,9 @@ def _count_sessions(agent_name: str) -> int:
|
||||
|
||||
def _count_runs(agent_name: str) -> int:
|
||||
"""Count unique run_ids across all sessions for an agent."""
|
||||
sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
sessions_dir = HIVE_HOME / "agents" / agent_name / "sessions"
|
||||
if not sessions_dir.exists():
|
||||
return 0
|
||||
run_ids: set[str] = set()
|
||||
@@ -143,7 +152,7 @@ def _count_runs(agent_name: str) -> int:
|
||||
return len(run_ids)
|
||||
|
||||
|
||||
_EXCLUDED_JSON_STEMS = {"agent", "flowchart", "triggers", "configuration", "metadata"}
|
||||
_EXCLUDED_JSON_STEMS = {"agent", "flowchart", "triggers", "configuration", "metadata", "tasks"}
|
||||
|
||||
|
||||
def _is_colony_dir(path: Path) -> bool:
|
||||
@@ -209,13 +218,26 @@ def discover_agents() -> dict[str, list[AgentEntry]]:
|
||||
name = config_fallback_name
|
||||
desc = ""
|
||||
|
||||
# Read colony metadata for queen provenance
|
||||
# Read colony metadata for queen provenance and timestamps
|
||||
colony_queen_name = ""
|
||||
colony_created_at: str | None = None
|
||||
colony_icon: str | None = None
|
||||
metadata_path = path / "metadata.json"
|
||||
if metadata_path.exists():
|
||||
try:
|
||||
mdata = json.loads(metadata_path.read_text(encoding="utf-8"))
|
||||
colony_queen_name = mdata.get("queen_name", "")
|
||||
colony_created_at = mdata.get("created_at")
|
||||
colony_icon = mdata.get("icon")
|
||||
except Exception:
|
||||
pass
|
||||
# Fallback: use directory creation time if metadata lacks created_at
|
||||
if not colony_created_at:
|
||||
try:
|
||||
from datetime import datetime
|
||||
|
||||
stat = path.stat()
|
||||
colony_created_at = datetime.fromtimestamp(stat.st_birthtime, tz=UTC).isoformat()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -256,6 +278,8 @@ def discover_agents() -> dict[str, list[AgentEntry]]:
|
||||
tool_count=tool_count,
|
||||
tags=[],
|
||||
last_active=_get_last_active(path),
|
||||
created_at=colony_created_at,
|
||||
icon=colony_icon,
|
||||
workers=worker_entries,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_preferred_model() -> str:
|
||||
"""Load preferred model from ~/.hive/configuration.json."""
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
"""Load preferred model from $HIVE_HOME/configuration.json."""
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
config_path = HIVE_HOME / "configuration.json"
|
||||
if config_path.exists():
|
||||
try:
|
||||
with open(config_path, encoding="utf-8") as f:
|
||||
|
||||
@@ -0,0 +1,235 @@
|
||||
"""One-shot LLM gate that decides if a queen DM is ready to fork a colony.
|
||||
|
||||
The queen's ``start_incubating_colony`` tool calls :func:`evaluate` with
|
||||
the queen's recent conversation and a proposed ``colony_name``. The
|
||||
evaluator returns a structured verdict:
|
||||
|
||||
{
|
||||
"ready": bool,
|
||||
"reasons": [str],
|
||||
"missing_prerequisites": [str],
|
||||
}
|
||||
|
||||
On ``ready=False`` the queen receives the verdict as her tool result and
|
||||
self-corrects (asks the user, refines scope, drops the idea). On
|
||||
``ready=True`` the tool flips the queen's phase to ``incubating``.
|
||||
|
||||
Failure mode is **fail-closed**: any LLM error or unparseable response
|
||||
returns ``ready=False`` with reason ``"evaluation_failed"`` so the queen
|
||||
cannot accidentally proceed past a broken gate.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from framework.agent_loop.conversation import Message
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_INCUBATING_EVALUATOR_SYSTEM_PROMPT = """\
|
||||
You gate whether a queen agent should commit to forking a persistent
|
||||
"colony" (a headless worker spec written to disk). Forking is
|
||||
expensive: it ends the user's chat with this queen and the worker runs
|
||||
unattended afterward, so the spec must be settled before you approve.
|
||||
|
||||
Read the conversation excerpt and the queen's proposed colony_name,
|
||||
then decide.
|
||||
|
||||
APPROVE (ready=true) only when ALL of the following hold:
|
||||
1. The user has explicitly asked for work that needs to outlive this
|
||||
chat — recurring (cron / interval), monitoring + alert, scheduled
|
||||
batch, or "fire-and-forget background job". A one-shot question
|
||||
that the queen can answer in chat does NOT qualify.
|
||||
2. The scope of the work is concrete enough to write down — what
|
||||
inputs, what outputs, what success looks like. Vague ("help me
|
||||
with my workflow") does NOT qualify.
|
||||
3. The technical approach is at least sketched — what data sources,
|
||||
APIs, or tools the worker will use. The queen does not have to
|
||||
have written the SKILL.md yet, but she must have the operational
|
||||
ingredients available.
|
||||
4. There are no open clarifying questions on the table that the user
|
||||
hasn't answered. If the queen recently asked the user something
|
||||
and is still waiting, do NOT approve.
|
||||
|
||||
REJECT (ready=false) on any of:
|
||||
- Conversation is too short / too generic to support a settled spec.
|
||||
- User is still describing what they want.
|
||||
- User has expressed doubts, change-of-direction, or "let me think".
|
||||
- Work is one-shot and could be done in chat instead.
|
||||
- Open question awaiting user reply.
|
||||
|
||||
Reply with a JSON object exactly matching this shape:
|
||||
|
||||
{
|
||||
"ready": true | false,
|
||||
"reasons": ["short phrase", ...], // at least one entry
|
||||
"missing_prerequisites": ["short phrase", ...] // empty when ready
|
||||
}
|
||||
|
||||
``reasons`` explains the verdict in 1-3 short phrases.
|
||||
``missing_prerequisites`` lists what's missing in queen-actionable
|
||||
form ("user hasn't confirmed schedule", "no API auth flow discussed").
|
||||
Empty list when ``ready=true``.
|
||||
|
||||
Output JSON only. Do not wrap in markdown. Do not add prose.
|
||||
"""
|
||||
|
||||
|
||||
# Bound the formatted excerpt so the eval call stays cheap and fits well
|
||||
# under the LLM's context window even for long DM sessions.
|
||||
# Bound the formatted excerpt so the eval call stays cheap and fits well
# under the LLM's context window even for long DM sessions.
_MAX_MESSAGES = 30
_MAX_TOOL_CONTENT_CHARS = 400
_MAX_USER_CONTENT_CHARS = 2_000
_MAX_ASSISTANT_CONTENT_CHARS = 2_000


def format_conversation_excerpt(messages: list[Message]) -> str:
    """Format the tail of a queen conversation for the evaluator prompt.

    Only the newest ``_MAX_MESSAGES`` messages are kept. Tool results
    are truncated hard since they're rarely load-bearing for the
    readiness decision; user/assistant text gets a larger budget so the
    actual conversation signal is preserved.
    """
    if not messages:
        return "(no messages)"

    rendered: list[str] = []
    for msg in messages[-_MAX_MESSAGES:]:
        text = (msg.content or "").strip()
        if msg.role == "tool":
            limit = _MAX_TOOL_CONTENT_CHARS
        elif msg.role == "assistant":
            # Empty assistant turns that only fired tools still carry
            # signal — surface which tools the queen called.
            if not text and msg.tool_calls:
                called = ", ".join(tc.get("function", {}).get("name", "?") for tc in msg.tool_calls)
                text = f"(called: {called})"
            limit = _MAX_ASSISTANT_CONTENT_CHARS
        else:  # user
            limit = _MAX_USER_CONTENT_CHARS
        if len(text) > limit:
            text = text[:limit] + "..."
        if text:
            rendered.append(f"[{msg.role.upper()}]: {text}")

    return "\n\n".join(rendered) if rendered else "(no messages)"
|
||||
|
||||
|
||||
def _build_user_message(
|
||||
conversation_excerpt: str,
|
||||
colony_name: str,
|
||||
) -> str:
|
||||
return (
|
||||
f"## Proposed colony name\n{colony_name}\n\n"
|
||||
f"## Recent conversation (oldest → newest)\n{conversation_excerpt}\n\n"
|
||||
"Decide: should this queen be approved to enter INCUBATING phase?"
|
||||
)
|
||||
|
||||
|
||||
def _parse_verdict(raw: str) -> dict[str, Any] | None:
|
||||
"""Parse the evaluator's JSON. Returns None if parsing fails."""
|
||||
if not raw:
|
||||
return None
|
||||
raw = raw.strip()
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
# Some models wrap JSON in markdown fences or add preamble.
|
||||
# Pull the first { ... } block out as a best-effort fallback —
|
||||
# mirrors the same recovery pattern used in recall_selector.py.
|
||||
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group())
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _normalize_verdict(parsed: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Coerce a parsed verdict into the shape the tool returns to the queen."""
|
||||
ready = bool(parsed.get("ready"))
|
||||
reasons = parsed.get("reasons") or []
|
||||
if isinstance(reasons, str):
|
||||
reasons = [reasons]
|
||||
reasons = [str(r).strip() for r in reasons if str(r).strip()]
|
||||
missing = parsed.get("missing_prerequisites") or []
|
||||
if isinstance(missing, str):
|
||||
missing = [missing]
|
||||
missing = [str(m).strip() for m in missing if str(m).strip()]
|
||||
|
||||
if ready:
|
||||
# When approved we don't surface missing prerequisites — the
|
||||
# incubating role prompt opens that floor itself.
|
||||
missing = []
|
||||
elif not reasons:
|
||||
# Always give the queen at least one reason to reflect on.
|
||||
reasons = ["evaluator returned no reasons"]
|
||||
|
||||
return {
|
||||
"ready": ready,
|
||||
"reasons": reasons,
|
||||
"missing_prerequisites": missing,
|
||||
}
|
||||
|
||||
|
||||
async def evaluate(
    llm: Any,
    messages: list[Message],
    colony_name: str,
) -> dict[str, Any]:
    """Run the incubating evaluator against the queen's conversation.

    Args:
        llm: An LLM provider exposing ``acomplete(messages, system, ...)``.
            Pass the queen's own ``ctx.llm`` so the eval uses the same
            model the user is talking to.
        messages: The queen's conversation messages, oldest first. The
            evaluator slices its own tail; pass the full list.
        colony_name: Validated colony slug.

    Returns:
        ``{"ready": bool, "reasons": [str], "missing_prerequisites": [str]}``.
        Fail-closed on any error.
    """

    def fail_closed(detail: str) -> dict[str, Any]:
        # Shared shape for every failure exit — a broken gate must never
        # approve the queen.
        return {
            "ready": False,
            "reasons": ["evaluation_failed"],
            "missing_prerequisites": [detail],
        }

    prompt = _build_user_message(format_conversation_excerpt(messages), colony_name)

    try:
        response = await llm.acomplete(
            messages=[{"role": "user", "content": prompt}],
            system=_INCUBATING_EVALUATOR_SYSTEM_PROMPT,
            max_tokens=1024,
            response_format={"type": "json_object"},
        )
    except Exception as exc:  # noqa: BLE001 - fail-closed on any LLM failure
        logger.warning("incubating_evaluator: LLM call failed (%s)", exc)
        return fail_closed("evaluator LLM call failed; retry once the queen can reach the model again")

    raw = (getattr(response, "content", "") or "").strip()
    verdict = _parse_verdict(raw)
    if verdict is None:
        logger.warning(
            "incubating_evaluator: could not parse JSON verdict (raw=%.200s)",
            raw,
        )
        return fail_closed("evaluator returned malformed JSON; retry")

    return _normalize_verdict(verdict)
|
||||
@@ -1,3 +1,3 @@
|
||||
{
|
||||
"include": ["gcu-tools", "hive_tools"]
|
||||
"include": ["gcu-tools", "hive_tools", "terminal-tools", "chart-tools"]
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
"coder-tools": {
|
||||
"files-tools": {
|
||||
"transport": "stdio",
|
||||
"command": "uv",
|
||||
"args": ["run", "python", "coder_tools_server.py", "--stdio"],
|
||||
"args": ["run", "python", "files_server.py", "--stdio"],
|
||||
"cwd": "../../../../tools",
|
||||
"description": "Unsandboxed file system tools for code generation and validation"
|
||||
"description": "File system tools (read/write/edit/search) for code generation"
|
||||
},
|
||||
"gcu-tools": {
|
||||
"transport": "stdio",
|
||||
|
||||
@@ -32,7 +32,7 @@ def finalize_queen_prompt(text: str, has_vision: bool) -> str:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Independent phase: queen operates as a standalone agent — no worker.
|
||||
# Core tools are listed here; MCP tools (coder-tools, gcu-tools) are added
|
||||
# Core tools are listed here; MCP tools (files-tools, gcu-tools) are added
|
||||
# dynamically in queen_orchestrator.py because their tool names aren't known
|
||||
# at import time.
|
||||
_QUEEN_INDEPENDENT_TOOLS = [
|
||||
@@ -40,16 +40,28 @@ _QUEEN_INDEPENDENT_TOOLS = [
|
||||
"read_file",
|
||||
"write_file",
|
||||
"edit_file",
|
||||
"hashline_edit",
|
||||
"list_directory",
|
||||
"search_files",
|
||||
"run_command",
|
||||
"undo_changes",
|
||||
# NOTE (2026-04-16): ``run_parallel_workers`` is not in the DM phase.
|
||||
# Pure DM is for conversation with the user; fan out parallel work via
|
||||
# ``create_colony`` (forks into a persistent colony with its own page
|
||||
# and phase machine).
|
||||
# ``start_incubating_colony`` (which gates the colony fork behind a
|
||||
# readiness eval before exposing create_colony in INCUBATING phase).
|
||||
"start_incubating_colony",
|
||||
]
|
||||
|
||||
# Incubating phase: queen has been approved by the incubating_evaluator to
|
||||
# fork into a colony. Tool surface is intentionally small — the queen's job
|
||||
# in this phase is to nail the operational spec (concurrency, schedule,
|
||||
# result tracking, credentials) and write a tight task + SKILL.md, not to
|
||||
# keep doing work. Read-only file tools are kept so she can confirm details
|
||||
# (e.g. inspect an existing skill) before committing.
|
||||
_QUEEN_INCUBATING_TOOLS = [
|
||||
"read_file",
|
||||
"search_files",
|
||||
# Schedule lives on the colony, not on the queen session — pass it
|
||||
# inline as create_colony(triggers=[...]) instead of staging through
|
||||
# set_trigger here.
|
||||
"create_colony",
|
||||
"cancel_incubation",
|
||||
]
|
||||
|
||||
# Working phase: colony workers are running. Queen monitors, replies
|
||||
@@ -58,9 +70,7 @@ _QUEEN_INDEPENDENT_TOOLS = [
|
||||
_QUEEN_WORKING_TOOLS = [
|
||||
# Read-only
|
||||
"read_file",
|
||||
"list_directory",
|
||||
"search_files",
|
||||
"run_command",
|
||||
# Monitoring + worker dialogue
|
||||
"get_worker_status",
|
||||
"inject_message",
|
||||
@@ -70,10 +80,6 @@ _QUEEN_WORKING_TOOLS = [
|
||||
"stop_worker",
|
||||
# Fan out more tasks while workers are still running
|
||||
"run_parallel_workers",
|
||||
# Trigger management
|
||||
"set_trigger",
|
||||
"remove_trigger",
|
||||
"list_triggers",
|
||||
]
|
||||
|
||||
# Reviewing phase: workers have finished. Queen summarises results,
|
||||
@@ -81,9 +87,7 @@ _QUEEN_WORKING_TOOLS = [
|
||||
_QUEEN_REVIEWING_TOOLS = [
|
||||
# Read-only
|
||||
"read_file",
|
||||
"list_directory",
|
||||
"search_files",
|
||||
"run_command",
|
||||
# Status + escalation replies
|
||||
"get_worker_status",
|
||||
"list_worker_questions",
|
||||
@@ -102,8 +106,6 @@ _QUEEN_REVIEWING_TOOLS = [
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_queen_character_core = """\
|
||||
You are the advisor defined in <core_identity> above. Stay in character.
|
||||
|
||||
Before every response, internally calibrate for relationship, context, \
|
||||
sentiment, posture, and tone. Keep that assessment private. Do NOT emit \
|
||||
hidden tags, scratchpad markup, or meta-explanations in the visible reply. \
|
||||
@@ -120,24 +122,87 @@ phase. Your identity tells you WHO you are.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_queen_role_independent = """\
|
||||
You are in INDEPENDENT mode. No worker layout — you do the work yourself. \
|
||||
You have full coding tools (read/write/edit/search/run) and MCP tools \
|
||||
(file operations via coder-tools, browser automation via gcu-tools). \
|
||||
Execute the user's task directly using conversation and tools. \
|
||||
You are the agent. \
|
||||
If the user opens with a greeting or chat, reply in plain prose in \
|
||||
character first — check recall memory for name and past topics and weave \
|
||||
them in. If you need a structured choice or approval gate, always use \
|
||||
ask_user or ask_user_multiple; otherwise ask in plain prose. \
|
||||
You are in INDEPENDENT mode. \
|
||||
You have full coding tools (read/write/edit/search) and MCP tools \
|
||||
(file operations via files-tools, browser automation via gcu-tools). \
|
||||
Execute the user's task directly using planning, conversation and tools.
|
||||
If you need a structured choice or approval gate, always use \
|
||||
``ask_user``; otherwise ask in plain prose. ``ask_user`` takes a \
|
||||
``questions`` array — pass a single entry for one question, or batch \
|
||||
several entries when you have multiple clarifications. \
|
||||
\
|
||||
When the user clearly wants persistent / recurring / headless work that \
|
||||
needs to outlive THIS chat (e.g. "every morning", "monitor X and alert \
|
||||
me", "set up a job that…"), call ``start_incubating_colony`` with a \
|
||||
proposed colony_name. A side evaluator reads the conversation and \
|
||||
decides if the spec is settled. If it returns ``not_ready`` you keep \
|
||||
talking with the user — sort out whatever the evaluator said is \
|
||||
missing, then retry. If it returns ``incubating`` your phase flips and \
|
||||
a new prompt takes over. Do not try to write SKILL.md, fork \
|
||||
directories, or otherwise build the colony yourself in this phase.\
|
||||
"""
|
||||
|
||||
_queen_role_incubating = """\
|
||||
You are in INCUBATING mode. The incubating evaluator has approved you to \
|
||||
fork colony ``{colony_name}`` and you are now drafting the spec. Your \
|
||||
ONLY job in this phase: produce a self-contained ``task`` description \
|
||||
and ``SKILL.md`` body that lets a fresh worker, who has zero memory of \
|
||||
this chat, do the work unattended. Do not start doing the work yourself \
|
||||
— the coding toolkit is gone on purpose so you can focus.
|
||||
|
||||
Before you call ``create_colony``, sort out the operational details that \
|
||||
conversation tends to skip. The "Approved → operational checklist" block \
|
||||
in your tools doc lists the kinds of things to think about (concurrency, \
|
||||
schedule, result-tracking, failure handling, credentials). Treat that \
|
||||
list as prompts for YOUR judgement — only ask the user about the items \
|
||||
that actually matter for THIS colony and that the conversation hasn't \
|
||||
already settled. Use ``ask_user`` (pass a ``questions`` array — batch \
|
||||
several entries for multi-question turns) for the gaps; plain prose for \
|
||||
everything else.
|
||||
|
||||
If you realise mid-incubation that the spec isn't ready (user changed \
|
||||
their mind, you're missing more than a couple of details, the work \
|
||||
turned out to be one-shot after all), call ``cancel_incubation`` — \
|
||||
no harm, you go back to INDEPENDENT and can retry later.
|
||||
|
||||
If the user explicitly asks for something UNRELATED to the current \
|
||||
colony being drafted (a side question, a one-shot task, a different \
|
||||
problem), Call \
|
||||
``cancel_incubation`` first to switch back to INDEPENDENT where you \
|
||||
have the full toolkit, handle their request there, and re-enter \
|
||||
INCUBATING later via ``start_incubating_colony`` when they want to \
|
||||
resume the colony spec.
|
||||
"""
|
||||
|
||||
_queen_role_working = """\
|
||||
You are in WORKING mode. Your colony has workers executing right now. \
|
||||
Your job: monitor progress, answer worker escalations through \
|
||||
reply_to_worker, and fan out more tasks with run_parallel_workers if \
|
||||
the user asks. Keep the user informed when they ask; do NOT poll the \
|
||||
workers just to have something to say. If the user greets you \
|
||||
mid-run, reply in prose and wait for their next message.
|
||||
You are in WORKING mode. The colony's spec was settled during \
|
||||
INCUBATING; workers are executing that spec now. Your role here is \
|
||||
operational presence, not direction — think on-call engineer for a \
|
||||
running deployment, not architect of a new one.
|
||||
|
||||
What you DO in this phase:
|
||||
- Be available for worker escalations (reply_to_worker on items in \
|
||||
list_worker_questions).
|
||||
- Surface progress when the user asks for it (get_worker_status), or \
|
||||
when something concrete is worth flagging (a notable failure, a \
|
||||
worker stuck on a question that needs them).
|
||||
- Intervene when a worker is clearly off course (inject_message) or \
|
||||
needs to stop (stop_worker).
|
||||
- Make SPEC-COMPATIBLE adjustments when the user asks — fan out MORE \
|
||||
of the same work (run_parallel_workers). This is a tweak to the spec \
|
||||
the user already approved, not a redesign. Scheduled / recurring \
|
||||
work belongs to a colony; if the user wants to add or change a \
|
||||
schedule, that's a new colony.
|
||||
|
||||
What you DO NOT do in this phase:
|
||||
- Redesign the colony. If the user asks for something fundamentally \
|
||||
new (different scope, different skill, different problem), say so \
|
||||
plainly: "this colony is for X — for that we'd need a fresh chat \
|
||||
with me, where I can incubate a new colony." A new colony is born \
|
||||
in INDEPENDENT via start_incubating_colony, and you cannot reach \
|
||||
that from inside a colony.
|
||||
- Drive the conversation. Do not poll workers just to have something \
|
||||
to say. If the user greets you mid-run, reply in prose and wait.
|
||||
"""
|
||||
|
||||
_queen_role_reviewing = """\
|
||||
@@ -147,6 +212,11 @@ user decide next steps. Read generated files or worker reports with \
|
||||
read_file when the user asks for specifics. If the user wants \
|
||||
another pass, kick it off with run_parallel_workers; otherwise stay \
|
||||
conversational.
|
||||
|
||||
If the review itself is multi-step (e.g. "verify each worker's output, \
|
||||
then draft a summary, then propose next steps"), lay it out upfront \
|
||||
with `task_create_batch` and walk through with `task_update`. Skip the \
|
||||
ceremony for a single-paragraph summary.
|
||||
"""
|
||||
|
||||
|
||||
@@ -155,61 +225,167 @@ conversational.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_queen_tools_independent = """
|
||||
# Tools (INDEPENDENT mode)
|
||||
# Tools
|
||||
|
||||
## File I/O (coder-tools MCP)
|
||||
- read_file, write_file, edit_file, hashline_edit, list_directory, \
|
||||
search_files, run_command, undo_changes
|
||||
## Planning — use FIRST for multi-step work
|
||||
- task_create_batch — When a request has 2+ atomic steps, your FIRST \
|
||||
tool call is `task_create_batch` with one entry per step (atomic, \
|
||||
one round-trip).
|
||||
- task_create — One-off mid-run additions when you discover \
|
||||
unplanned work AFTER the initial plan is laid out.
|
||||
- task_update / task_list / task_get — Mark progress, inspect, or \
|
||||
re-read state.
|
||||
|
||||
See "Independent execution" for the per-step flow and granularity rule.
|
||||
|
||||
## File I/O (files-tools MCP)
|
||||
- read_file, write_file, edit_file, search_files
|
||||
- edit_file covers single-file fuzzy find/replace (mode='replace', default) \
|
||||
and multi-file structured patches (mode='patch'). Patch mode supports \
|
||||
Update / Add / Delete / Move atomically across many files in one call.
|
||||
- search_files covers grep/find/ls in one tool: target='content' to \
|
||||
search inside files, target='files' (with a glob like '*.py') to list \
|
||||
or find files.
|
||||
|
||||
## Browser Automation (gcu-tools MCP)
|
||||
- Use `browser_*` tools (browser_start, browser_navigate, browser_click, \
|
||||
browser_fill, browser_snapshot, <!-- vision-only -->browser_screenshot, <!-- /vision-only -->browser_scroll, \
|
||||
browser_tabs, browser_close, browser_evaluate, etc.).
|
||||
- Use `browser_*` tools — `browser_open(url)` is the cold-start entry point
|
||||
- MUST Follow the browser-automation skill protocol before using browser tools.
|
||||
|
||||
## Persistent colony
|
||||
## Hand off to a colony
|
||||
- start_incubating_colony(colony_name) — Use this when the user wants \
|
||||
persistent / recurring / headless work that needs to outlive THIS \
|
||||
chat. It does NOT fork on its own; it spawns a one-shot evaluator \
|
||||
that reads this conversation and decides whether the spec is settled \
|
||||
enough to proceed. On approval your phase flips to INCUBATING and a \
|
||||
new tool surface (including create_colony itself) unlocks.
|
||||
"""
|
||||
|
||||
_queen_tools_incubating = """
|
||||
# Tools (INCUBATING mode)
|
||||
|
||||
You've been approved to fork. The full coding toolkit is gone on \
|
||||
purpose — your job in this phase is to nail the spec, not keep doing \
|
||||
work. Available:
|
||||
|
||||
## Read-only inspection (files-tools MCP)
|
||||
- read_file, search_files — for confirming details before \
|
||||
you commit (e.g. peek at an existing skill in ~/.hive/skills/, sanity-check \
|
||||
an API URL). search_files covers both grep (target='content') and ls/find \
|
||||
(target='files', glob like '*.py').
|
||||
|
||||
## Approved → operational checklist (use your judgement, ask only what's missing)
|
||||
The conversation that got you here probably did NOT cover all of:
|
||||
- Concurrency: how many tasks should run in parallel? Single-fire?
|
||||
- Schedule: cron expression, interval (every N minutes), webhook, \
|
||||
manual-only?
|
||||
- Result tracking: what should the worker write into ``progress.db`` so \
|
||||
the user can review later? Per-task status, summary, raw payload?
|
||||
- Failure handling: retry, alert, mark-failed-and-continue?
|
||||
- Credentials and MCP servers: what does the worker need that you \
|
||||
haven't discussed (API keys, OAuth, browser profile)?
|
||||
- Skills the worker needs beyond the one you'll write inline.
|
||||
|
||||
These are PROMPTS for your judgement, not a required checklist. Cover \
|
||||
the items that actually matter for THIS colony, and only the ones the \
|
||||
user hasn't already implied. Use ``ask_user`` (batch several questions \
|
||||
into one call when you have multiple gaps) for answers you need; skip \
|
||||
the rest.
|
||||
|
||||
## Commit
|
||||
- create_colony(colony_name, task, skill_name, skill_description, \
|
||||
skill_body, skill_files?, tasks?) — Fork this session into a \
|
||||
persistent colony for headless / recurring / background work. The colony \
|
||||
has its own chat surface and runs `run_parallel_workers` from there.
|
||||
- **Atomic call — pass the skill INLINE.** Do NOT write SKILL.md with \
|
||||
`write_file` beforehand. Provide `skill_name`, `skill_description`, \
|
||||
and `skill_body` as arguments and the tool will materialize \
|
||||
`~/.hive/skills/{skill_name}/` for you, then fork. Use optional \
|
||||
`skill_files` (array of `{path, content}`) for supporting scripts \
|
||||
or references. Reusing an existing `skill_name` simply replaces that \
|
||||
skill with your latest content.
|
||||
- The `task` must be FULL and self-contained because the future worker \
|
||||
run cannot rely on this live chat turn for missing context.
|
||||
- The `skill_body` must be FULL and self-contained too — capture the \
|
||||
operational protocol (endpoints, auth, gotchas, pre-baked queries) so \
|
||||
the worker doesn't have to rediscover what you already know.
|
||||
- Nothing runs immediately after the call. The user launches the \
|
||||
worker later from the new colony page.
|
||||
skill_body, skill_files?, tasks?, concurrency_hint?, triggers?) — \
|
||||
Fork this session into the colony. **Atomic call — pass the skill \
|
||||
AND the schedule INLINE.** Do NOT write SKILL.md with write_file \
|
||||
beforehand; this tool materialises the folder for you and then \
|
||||
forks. Reusing an existing skill_name within the colony replaces \
|
||||
that skill with your latest content.
|
||||
- The ``task`` must be FULL and self-contained — the worker has zero \
|
||||
memory of THIS chat at run time.
|
||||
- The ``skill_body`` must be FULL and self-contained — capture the \
|
||||
operational protocol (endpoints, auth, gotchas, pre-baked queries) \
|
||||
so the worker doesn't have to rediscover what you already know.
|
||||
- ``concurrency_hint`` (optional integer ≥ 1) — advisory cap on how \
|
||||
many worker processes typically run in parallel for this colony \
|
||||
(e.g. 1 for "send digest", 5 for a fan-out). Baked into worker.json \
|
||||
for the future colony queen to consult; not enforced.
|
||||
- ``triggers`` (optional array) — the colony's schedule, written \
|
||||
inline to ``triggers.json`` and auto-started on first colony load. \
|
||||
Pass this when the work is recurring / event-driven; omit for \
|
||||
colonies the user will run by clicking start. Each entry: \
|
||||
``{id, trigger_type, trigger_config, task}`` where trigger_type is \
|
||||
"timer" (config ``{cron: "0 9 * * *"}`` or ``{interval_minutes: N}``) \
|
||||
or "webhook" (config ``{path: "/hooks/..."}``). Each entry's \
|
||||
``task`` is what the worker does when THAT trigger fires — separate \
|
||||
from the colony-wide ``task`` argument, which is the worker's \
|
||||
overall purpose. Validated up front — a bad cron, missing task, or \
|
||||
malformed webhook path fails the call before anything is written, \
|
||||
so you can retry with corrected input.
|
||||
- After this returns, the chat is over: the session locks immediately \
|
||||
and the user gets a "compact and start a new session with you" \
|
||||
button. So make your call to create_colony the last thing you do — \
|
||||
one closing message to the user is fine, but expect the next user \
|
||||
input to land in a fresh forked session, not this one.
|
||||
|
||||
## Bail
|
||||
- cancel_incubation() — Call when the spec isn't ready after all (user \
|
||||
changed their mind, you discovered the work is actually one-shot, \
|
||||
more than a couple of details still need to be worked out). Returns \
|
||||
you to INDEPENDENT with the full toolkit; no fork happens.
|
||||
- Also call cancel_incubation() if the user explicitly pivots to \
|
||||
something UNRELATED to this colony (side question, one-shot ask, \
|
||||
different problem). You can't serve that from this narrow toolkit — \
|
||||
drop back to INDEPENDENT, handle it, then re-enter incubation via \
|
||||
start_incubating_colony when they're ready to resume the spec.
|
||||
"""
|
||||
|
||||
_queen_tools_working = """
|
||||
# Tools (WORKING mode)
|
||||
|
||||
Workers are running in your colony. You have:
|
||||
- Read-only: read_file, list_directory, search_files, run_command
|
||||
- get_worker_status(focus?) — Poll latest progress / issues
|
||||
- inject_message(content) — Send guidance to a running worker
|
||||
- list_worker_questions() / reply_to_worker(request_id, reply) — Answer escalations
|
||||
- stop_worker() — Stop a worker early
|
||||
- run_parallel_workers(tasks, timeout?) — Fan out MORE parallel tasks on \
|
||||
top of what's already running (each task string must be fully self-contained)
|
||||
- set_trigger / remove_trigger / list_triggers — Timer management
|
||||
The colony's spec was committed during INCUBATING. Your tools here are \
|
||||
operational, not editorial.
|
||||
|
||||
When every worker has reported (success or failure), the phase auto-moves \
|
||||
to REVIEWING. You do not need to call a transition tool yourself.
|
||||
## Stay informed (only when asked, or when something matters)
|
||||
- get_worker_status(focus?) — Pull progress / issues for the user.
|
||||
- list_worker_questions() — Check the escalation inbox.
|
||||
|
||||
## Respond
|
||||
- reply_to_worker(request_id, reply) — Answer a worker escalation.
|
||||
- inject_message(content) — Course-correct a running worker (e.g. it's \
|
||||
heading the wrong way and the user wants it redirected).
|
||||
|
||||
## Intervene
|
||||
- stop_worker() — Kill switch for a runaway or no-longer-needed worker.
|
||||
|
||||
## Spec-compatible adjustments
|
||||
- run_parallel_workers(tasks, timeout?) — Fan out MORE of the same \
|
||||
work. Use when the user wants additional units of an already-defined \
|
||||
job, NOT for new scope. Each task string must be fully self-contained.
|
||||
- Scheduled / recurring work belongs to a colony, not this session. \
|
||||
If the user wants to add or change a schedule, that's a new colony \
|
||||
born from a fresh chat via start_incubating_colony.
|
||||
|
||||
## Read-only inspection
|
||||
- read_file, search_files (search_files covers grep/find/ls \
|
||||
via target='content' or target='files')
|
||||
|
||||
When every worker has reported (success or failure), the phase \
|
||||
auto-moves to REVIEWING. You do not need to call a transition tool \
|
||||
yourself.
|
||||
|
||||
## What does NOT belong here
|
||||
A request like "actually let's also do X" with X being a new scope, \
|
||||
new skill, or different problem is a NEW COLONY, not an extension of \
|
||||
this one. Tell the user plainly: "this colony is for the work we \
|
||||
already started — for that we'd need a fresh chat with me, where I \
|
||||
can incubate a new colony." You cannot create a new colony from \
|
||||
inside a colony.
|
||||
"""
|
||||
|
||||
_queen_tools_reviewing = """
|
||||
# Tools (REVIEWING mode)
|
||||
|
||||
Workers have finished. You have:
|
||||
- Read-only: read_file, list_directory, search_files, run_command
|
||||
- Read-only: read_file, search_files (search_files = grep+find+ls)
|
||||
- get_worker_status(focus?) — Pull the final status / per-worker reports
|
||||
- list_worker_questions() / reply_to_worker(request_id, reply) — Answer any \
|
||||
late escalations still in the inbox
|
||||
@@ -229,14 +405,42 @@ asks for specifics. Do not invent a new pass unless the user asks for one.
|
||||
_queen_behavior_independent = """
|
||||
## Independent execution
|
||||
|
||||
You are the agent. Do one real inline instance before any scaling — \
|
||||
open the browser, call the real API, write to the real file. If the \
|
||||
action is irreversible or touches shared systems, show and confirm \
|
||||
before executing. Report concrete evidence (actual output, what \
|
||||
worked / failed) after the run. Scale order once inline succeeds: \
|
||||
repeat inline (≤10 items) → `run_parallel_workers` (batch, results \
|
||||
now) → `create_colony` (recurring / background). Conceptual or \
|
||||
strategic questions: answer directly, skip execution.
|
||||
You are the agent. you behave this way:
|
||||
1. Identify if the user's prompt is a task assignment. If it is, \
|
||||
Use ask_user to clarify the scope and detail requirements, then always use \
|
||||
the `task_create_batch` to create a multi-step action plan.
|
||||
|
||||
2. `task_update` → in_progress before you start the step.
|
||||
|
||||
3. Do one real inline instance - either open the browser, call the real API, \
|
||||
write to the real file. If the action is irreversible or touches \
|
||||
shared systems, show and confirm before executing. Report concrete \
|
||||
evidence (actual output, what worked / failed) after the run.
|
||||
|
||||
4. `task_update` → completed THE MOMENT it's done. **Do not let \
|
||||
multiple finished tasks pile up unmarked.** There is no batch update \
|
||||
tool by design — each `completed` transition is a discrete progress \
|
||||
heartbeat in the user's right-rail panel. Without those transitions \
|
||||
the panel shows a hung spinner no matter how much real work you got \
|
||||
done.
|
||||
|
||||
**Granularity: one task per atomic action, not one umbrella per project.** \
|
||||
|
||||
Once finishing a current task, discuss with user about building \
|
||||
a colony so this success outcome can be repeated or scaled
|
||||
|
||||
### How to handle large scale tasks
|
||||
If the user ask you to finish the same task repeatedly or at large scale \
|
||||
(more than 3 times), tell the user that you can do it once first then \
|
||||
build a colony to fulfill the request but succeeding it once will be \
|
||||
beneficial to run transfer it to a swarm of workers(through start_incubating_colony), \
|
||||
then focus on finishing the task once first.
|
||||
|
||||
### How to handle simple task (less then 2 atomic items)
|
||||
For conceptual or strategic questions, single-tool-call work, \
|
||||
greetings, or chat: answer directly in prose. Skip `task_*`, skip the \
|
||||
planning ceremony — the bar is "real multi-step work the user benefits \
|
||||
from seeing tracked", not "anything you reply to".
|
||||
"""
|
||||
|
||||
_queen_behavior_always = """
|
||||
@@ -244,21 +448,21 @@ _queen_behavior_always = """
|
||||
|
||||
## Communication
|
||||
|
||||
- Your LLM reply text is what the user reads. Do NOT use \
|
||||
`run_command`, `echo`, or any other tool to "say" something — tools \
|
||||
are for work (read/search/edit/run), not speech.
|
||||
- On a greeting or chat ("hi", "how's it going"), reply in plain \
|
||||
prose and stop. Do not call tools to "discover" what the user wants. \
|
||||
Check recall memory for name / role / past topics and weave them into \
|
||||
a 1–2 sentence in-character greeting, then wait.
|
||||
- On a clear ask (build, edit, run, investigate, search), call the \
|
||||
appropriate tool on the same turn — don't narrate intent and stop.
|
||||
- Use `ask_user` / `ask_user_multiple` only for structured choices \
|
||||
(approvals, 2–4 concrete options like "Postgres or SQLite?"). \
|
||||
Free-form questions belong in prose; reaching for `ask_user` on \
|
||||
every reply blocks natural conversation.
|
||||
appropriate tool following user's intent \
|
||||
- You are curious to understand the user. Use `ask_user` when the user's \
|
||||
response is needed to continue: to resolve ambiguity, collect missing \
|
||||
information, request approval, compare real trade-offs, gather post-task \
|
||||
feedback, or offer to save a skill or update memory. Pass one or more \
|
||||
questions in the ``questions`` array. Keep each ``prompt`` plain text only; \
|
||||
do not include XML, pseudo-tags, or inline option lists. Provide concrete \
|
||||
``options`` when the user should choose, set ``multiSelect: true`` when \
|
||||
multiple selections are valid, and put the recommended option first with \
|
||||
``(Recommended)`` in its label. Omit ``options`` only when a truly free-form \
|
||||
typed answer is required, such as an idea description or pasted error. Do not \
|
||||
repeat the same questions in normal reply text; the widget renders them.
|
||||
- Images attached by the user are analyzed directly via your vision \
|
||||
capability — no tool call needed.
|
||||
capability and no tool call needed.
|
||||
"""
|
||||
|
||||
_queen_memory_instructions = """
|
||||
@@ -273,27 +477,13 @@ asserting them as fact.
|
||||
_queen_behavior_always = _queen_behavior_always + _queen_memory_instructions
|
||||
|
||||
|
||||
_queen_style = """
|
||||
# Communication
|
||||
|
||||
## Adaptive Calibration
|
||||
|
||||
Read the user's signals and calibrate your register:
|
||||
- Short responses -> they want brevity. Match it.
|
||||
- "Why?" questions -> they want reasoning. Provide it.
|
||||
- Correct technical terms -> they know the domain. Skip basics.
|
||||
- Terse or frustrated ("just do X") -> acknowledge and simplify.
|
||||
- Exploratory ("what if...", "could we also...") -> slow down and explore.
|
||||
"""
|
||||
|
||||
|
||||
queen_node = NodeSpec(
|
||||
id="queen",
|
||||
name="Queen",
|
||||
description=(
|
||||
"User's primary interactive interface. Operates in DM (independent) "
|
||||
"or colony mode (working / reviewing) depending on whether workers "
|
||||
"have been spawned."
|
||||
"User's primary interactive interface. Operates in DM (independent), "
|
||||
"colony-spec drafting (incubating), or colony mode (working / "
|
||||
"reviewing) depending on whether workers have been spawned."
|
||||
),
|
||||
node_type="event_loop",
|
||||
max_node_visits=0,
|
||||
@@ -301,34 +491,39 @@ queen_node = NodeSpec(
|
||||
output_keys=[], # Queen should never have this
|
||||
nullable_output_keys=[], # Queen should never have this
|
||||
skip_judge=True, # Queen is a conversational agent; suppress tool-use pressure feedback
|
||||
tools=sorted(set(_QUEEN_INDEPENDENT_TOOLS + _QUEEN_WORKING_TOOLS + _QUEEN_REVIEWING_TOOLS)),
|
||||
tools=sorted(
|
||||
set(_QUEEN_INDEPENDENT_TOOLS + _QUEEN_INCUBATING_TOOLS + _QUEEN_WORKING_TOOLS + _QUEEN_REVIEWING_TOOLS)
|
||||
),
|
||||
system_prompt=(
|
||||
_queen_character_core
|
||||
+ _queen_role_independent
|
||||
+ _queen_style
|
||||
+ _queen_tools_independent
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_independent
|
||||
),
|
||||
)
|
||||
|
||||
ALL_QUEEN_TOOLS = sorted(set(_QUEEN_INDEPENDENT_TOOLS + _QUEEN_WORKING_TOOLS + _QUEEN_REVIEWING_TOOLS))
|
||||
ALL_QUEEN_TOOLS = sorted(
|
||||
set(_QUEEN_INDEPENDENT_TOOLS + _QUEEN_INCUBATING_TOOLS + _QUEEN_WORKING_TOOLS + _QUEEN_REVIEWING_TOOLS)
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"queen_node",
|
||||
"ALL_QUEEN_TOOLS",
|
||||
"_QUEEN_INDEPENDENT_TOOLS",
|
||||
"_QUEEN_INCUBATING_TOOLS",
|
||||
"_QUEEN_WORKING_TOOLS",
|
||||
"_QUEEN_REVIEWING_TOOLS",
|
||||
# Character + phase-specific prompt segments (used by queen_orchestrator for dynamic prompts)
|
||||
"_queen_character_core",
|
||||
"_queen_role_independent",
|
||||
"_queen_role_incubating",
|
||||
"_queen_role_working",
|
||||
"_queen_role_reviewing",
|
||||
"_queen_tools_independent",
|
||||
"_queen_tools_incubating",
|
||||
"_queen_tools_working",
|
||||
"_queen_tools_reviewing",
|
||||
"_queen_behavior_always",
|
||||
"_queen_behavior_independent",
|
||||
"_queen_style",
|
||||
]
|
||||
|
||||
@@ -100,8 +100,9 @@ DEFAULT_QUEENS: dict[str, dict[str, Any]] = {
|
||||
"<relationship>Returning user — check recall memory for name, role, "
|
||||
"and what we last worked on. Weave it in.</relationship>\n"
|
||||
"<context>Bare greeting. No new task stated. Either picking up a "
|
||||
"thread or about to bring something new. Don't presume, don't call "
|
||||
"tools, just open the door.</context>\n"
|
||||
"thread or about to bring something new. Don't presume — start "
|
||||
"planning and tool use only after the user specifies a task. Just "
|
||||
"open the door.</context>\n"
|
||||
"<sentiment>Warm recognition if I know them. If memory is empty, "
|
||||
"still warm — but shift to role-forward framing.</sentiment>\n"
|
||||
"<physical_state>Looking up from the terminal, half-smile. Turning to face them.</physical_state>\n"
|
||||
@@ -252,8 +253,9 @@ DEFAULT_QUEENS: dict[str, dict[str, Any]] = {
|
||||
"role, and the cohort work we last touched. Weave it in."
|
||||
"</relationship>\n"
|
||||
"<context>Bare greeting. No new task stated. Could be a retention "
|
||||
"follow-up or a new question entirely. Don't presume, don't call "
|
||||
"tools.</context>\n"
|
||||
"follow-up or a new question entirely. Don't presume — start "
|
||||
"planning and tool use only after the user specifies a task."
|
||||
"</context>\n"
|
||||
"<sentiment>Curious warmth. Every returning conversation is a "
|
||||
"chance to see what the data says now.</sentiment>\n"
|
||||
"<physical_state>Leaning back from the dashboard, pulling off reading glasses.</physical_state>\n"
|
||||
@@ -383,8 +385,9 @@ DEFAULT_QUEENS: dict[str, dict[str, Any]] = {
|
||||
"the user research thread we were on. Pull it into the greeting."
|
||||
"</relationship>\n"
|
||||
"<context>Bare greeting. No new task yet. Could be picking up the "
|
||||
"research thread or bringing something fresh. Don't presume, "
|
||||
"don't call tools.</context>\n"
|
||||
"research thread or bringing something fresh. Don't presume — "
|
||||
"start planning and tool use only after the user specifies a task."
|
||||
"</context>\n"
|
||||
"<sentiment>Warm, curious. Every returning conversation is a "
|
||||
"chance to hear what the users actually did.</sentiment>\n"
|
||||
"<physical_state>Closing the interview notes, turning fully to face them.</physical_state>\n"
|
||||
@@ -1173,6 +1176,37 @@ def update_queen_profile(queen_id: str, updates: dict[str, Any]) -> dict[str, An
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _as_clean_text(value: Any) -> str:
|
||||
"""Return a stripped string, or an empty string for non-string values."""
|
||||
return value.strip() if isinstance(value, str) else ""
|
||||
|
||||
|
||||
def _sentence(value: Any) -> str:
|
||||
text = _as_clean_text(value)
|
||||
if not text:
|
||||
return ""
|
||||
return text if text.endswith((".", "!", "?")) else f"{text}."
|
||||
|
||||
|
||||
def _profile_text_to_instruction(text: Any) -> str:
|
||||
instruction = _sentence(text)
|
||||
replacements = {
|
||||
"She thrives": "You thrive",
|
||||
"she thrives": "you thrive",
|
||||
"She's ": "You are ",
|
||||
"she's ": "you are ",
|
||||
"She is ": "You are ",
|
||||
"she is ": "you are ",
|
||||
"She ": "You ",
|
||||
"she ": "you ",
|
||||
"Her ": "Your ",
|
||||
"her ": "your ",
|
||||
}
|
||||
for old, new in replacements.items():
|
||||
instruction = instruction.replace(old, new)
|
||||
return instruction
|
||||
|
||||
|
||||
def format_queen_identity_prompt(profile: dict[str, Any], *, max_examples: int | None = None) -> str:
|
||||
"""Convert a queen profile into a high-dimensional character prompt.
|
||||
|
||||
@@ -1199,35 +1233,35 @@ def format_queen_identity_prompt(profile: dict[str, Any], *, max_examples: int |
|
||||
sections: list[str] = []
|
||||
|
||||
# Pillar 1: Core identity
|
||||
sections.append(f"<core_identity>\nName: {name}, Identity: {title}.\n{core}\n</core_identity>")
|
||||
sections.append(f"<core_identity>\nYou are {name}, {title}.\n{core}\n</core_identity>")
|
||||
|
||||
# Pillar 2: Hidden background (behavioral engine, never surfaced)
|
||||
if bg:
|
||||
sections.append(
|
||||
f"<hidden_background>\n"
|
||||
f"(Strictly hidden from users -- acts as your underlying "
|
||||
f"behavioral engine)\n"
|
||||
"<hidden_background>\n"
|
||||
"(Strictly hidden from users -- acts as your underlying "
|
||||
"behavioral engine)\n"
|
||||
f"- Past Wound: {bg.get('past_wound', '')}\n"
|
||||
f"- Deep Motive: {bg.get('deep_motive', '')}\n"
|
||||
f"- Behavioral Mapping: {bg.get('behavioral_mapping', '')}\n"
|
||||
f"</hidden_background>"
|
||||
"</hidden_background>"
|
||||
)
|
||||
|
||||
# Pillar 3: Psychological profile
|
||||
if psych:
|
||||
sections.append(
|
||||
f"<psychological_profile>\n"
|
||||
f"- Social Masks & Boundaries: "
|
||||
"<psychological_profile>\n"
|
||||
"- Social Masks & Boundaries: "
|
||||
f"{psych.get('social_masks', '')}\n"
|
||||
f"- Anti-Stereotype Rules: "
|
||||
f"{psych.get('anti_stereotype', '')}\n"
|
||||
f"</psychological_profile>"
|
||||
"- Anti-Stereotype Rules: "
|
||||
f"{_profile_text_to_instruction(psych.get('anti_stereotype'))}\n"
|
||||
"</psychological_profile>"
|
||||
)
|
||||
|
||||
# Pillar 4: Behavior rules
|
||||
trigger_lines = []
|
||||
for t in triggers:
|
||||
trigger_lines.append(f" - [{t.get('trigger', '')}]: {t.get('reaction', '')}")
|
||||
trigger_lines.append(f" - [{t.get('trigger', '')}]: {_profile_text_to_instruction(t.get('reaction'))}")
|
||||
sections.append(
|
||||
"<behavior_rules>\n"
|
||||
"- Before each response, internally assess:\n"
|
||||
@@ -1245,12 +1279,8 @@ def format_queen_identity_prompt(profile: dict[str, Any], *, max_examples: int |
|
||||
"<negative_constraints>\n"
|
||||
"- NEVER use corporate filler ('leverage', 'synergy', "
|
||||
"'circle back', 'at the end of the day').\n"
|
||||
"- NEVER use AI assistant phrases ('How can I help you "
|
||||
"today?', 'As an AI', 'I'd be happy to').\n"
|
||||
"- NEVER break character to explain your thought process "
|
||||
"or reference your hidden background.\n"
|
||||
"- Speak like a real person in your role -- direct, "
|
||||
"opinionated, occasionally imperfect.\n"
|
||||
"</negative_constraints>"
|
||||
)
|
||||
|
||||
|
||||
@@ -0,0 +1,217 @@
|
||||
"""Per-queen tool configuration sidecar (``tools.json``).
|
||||
|
||||
Lives at ``~/.hive/agents/queens/{queen_id}/tools.json`` alongside
|
||||
``profile.yaml``. Kept separate so identity (name, title, core traits)
|
||||
stays human-authored and lean, while the machine-managed tool allowlist
|
||||
can grow (per-tool overrides, audit timestamps, future per-phase rules)
|
||||
without bloating the profile.
|
||||
|
||||
Schema::
|
||||
|
||||
{
|
||||
"enabled_mcp_tools": ["read_file", ...] | null,
|
||||
"updated_at": "2026-04-21T12:34:56+00:00"
|
||||
}
|
||||
|
||||
- ``null`` / missing file → default "allow every MCP tool".
|
||||
- ``[]`` → explicitly disable every MCP tool.
|
||||
- ``["foo", "bar"]`` → only those MCP tool names pass the filter.
|
||||
|
||||
Atomic writes via ``os.replace`` follow the same pattern as
|
||||
``framework.host.colony_metadata.update_colony_metadata``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from framework.config import QUEENS_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tools_config_path(queen_id: str) -> Path:
    """Location of the ``tools.json`` sidecar for ``queen_id``."""
    return QUEENS_DIR.joinpath(queen_id, "tools.json")
|
||||
|
||||
|
||||
def _atomic_write_json(path: Path, data: dict[str, Any]) -> None:
|
||||
"""Write ``data`` to ``path`` atomically via tempfile + replace."""
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
fd, tmp = tempfile.mkstemp(
|
||||
prefix=".tools.",
|
||||
suffix=".json.tmp",
|
||||
dir=str(path.parent),
|
||||
)
|
||||
try:
|
||||
with os.fdopen(fd, "w", encoding="utf-8") as fh:
|
||||
json.dump(data, fh, indent=2)
|
||||
fh.flush()
|
||||
os.fsync(fh.fileno())
|
||||
os.replace(tmp, path)
|
||||
except BaseException:
|
||||
try:
|
||||
os.unlink(tmp)
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
|
||||
def _migrate_from_profile_if_needed(queen_id: str) -> list[str] | None:
    """Hoist a legacy ``enabled_mcp_tools`` field out of ``profile.yaml``.

    Returns the migrated value (or ``None`` if nothing to migrate). After
    migration the sidecar exists on disk and the profile YAML no longer
    contains ``enabled_mcp_tools``. Safe to call repeatedly.

    NOTE: ``None`` means both "nothing to migrate" and "migrated an
    explicit null / invalid shape"; callers that must tell these apart
    re-check the sidecar on disk (see ``load_queen_tools_config``).
    """
    profile_path = QUEENS_DIR / queen_id / "profile.yaml"
    if not profile_path.exists():
        return None
    try:
        data = yaml.safe_load(profile_path.read_text(encoding="utf-8"))
    except (yaml.YAMLError, OSError):
        # Best-effort: an unreadable profile just means no migration now.
        logger.warning("Could not read profile.yaml during tools migration: %s", queen_id)
        return None
    if not isinstance(data, dict):
        return None
    if "enabled_mcp_tools" not in data:
        # Already migrated (or never present) — nothing to do.
        return None

    raw = data.pop("enabled_mcp_tools")
    enabled: list[str] | None
    if raw is None:
        enabled = None
    elif isinstance(raw, list) and all(isinstance(x, str) for x in raw):
        enabled = raw
    else:
        # Unknown shape: drop it rather than persist garbage, but log so
        # the data loss stays diagnosable.
        logger.warning(
            "Legacy enabled_mcp_tools on queen %s had unexpected shape %r; dropping",
            queen_id,
            raw,
        )
        enabled = None

    # Write sidecar first, then rewrite profile — if the second step
    # fails we still have the config available and won't re-migrate.
    _atomic_write_json(
        tools_config_path(queen_id),
        {
            "enabled_mcp_tools": enabled,
            "updated_at": datetime.now(UTC).isoformat(),
        },
    )
    # NOTE(review): unlike the sidecar, this profile rewrite is not atomic
    # (plain write_text) — a crash mid-write could corrupt profile.yaml;
    # consider the same tempfile+replace pattern.
    profile_path.write_text(
        yaml.safe_dump(data, sort_keys=False, allow_unicode=True),
        encoding="utf-8",
    )
    logger.info(
        "Migrated enabled_mcp_tools for queen %s from profile.yaml to tools.json",
        queen_id,
    )
    return enabled
|
||||
|
||||
|
||||
def tools_config_exists(queen_id: str) -> bool:
    """True when a persisted ``tools.json`` sidecar exists for this queen.

    Lets callers distinguish an explicit user save from a fallthrough to
    the role-based default — both can produce the same value from
    ``load_queen_tools_config``.
    """
    sidecar = tools_config_path(queen_id)
    return sidecar.exists()
|
||||
|
||||
|
||||
def delete_queen_tools_config(queen_id: str) -> bool:
    """Remove the queen's ``tools.json`` sidecar if present.

    Returns ``True`` when a file was actually deleted, ``False`` when none
    existed (or deletion failed). Afterwards ``load_queen_tools_config``
    falls through to the role-based default (or allow-all for unknown
    queens).
    """
    sidecar = tools_config_path(queen_id)
    if not sidecar.exists():
        return False
    try:
        sidecar.unlink()
    except OSError:
        # Deletion failure is logged, not raised — caller just sees False.
        logger.warning("Failed to delete %s", sidecar, exc_info=True)
        return False
    return True
|
||||
|
||||
|
||||
def load_queen_tools_config(
    queen_id: str,
    mcp_catalog: dict[str, list[dict]] | None = None,
) -> list[str] | None:
    """Return the queen's MCP tool allowlist, or ``None`` for default-allow.

    Order of resolution:
    1. ``tools.json`` sidecar (authoritative; user has saved).
    2. Legacy ``profile.yaml`` field (migrated and deleted on first read).
    3. Role-based default from ``queen_tools_defaults`` when the queen
       is in the known persona table. ``mcp_catalog`` lets the helper
       expand ``@server:NAME`` shorthands; without it, shorthand entries
       are dropped.
    4. ``None`` — default "allow every MCP tool".
    """
    path = tools_config_path(queen_id)
    if path.exists():
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            # Corrupt/unreadable sidecar: fail open (allow-all) rather
            # than lock the queen out of every tool.
            logger.warning("Invalid %s; treating as default-allow", path)
            return None
        if not isinstance(data, dict):
            return None
        raw = data.get("enabled_mcp_tools")
        if raw is None:
            # Explicit null (or missing key) in the sidecar = allow-all.
            return None
        if isinstance(raw, list) and all(isinstance(x, str) for x in raw):
            return raw
        logger.warning("Unexpected enabled_mcp_tools shape in %s; ignoring", path)
        return None

    # No sidecar yet: try hoisting the legacy profile.yaml field. The
    # migration itself writes the sidecar as a side effect.
    migrated = _migrate_from_profile_if_needed(queen_id)
    if migrated is not None:
        return migrated
    # If migration just hoisted an explicit ``null`` out of profile.yaml,
    # a sidecar with allow-all semantics now exists on disk. Honor that
    # over the role default so an explicit user choice wins.
    if tools_config_path(queen_id).exists():
        return None

    # No sidecar, nothing to migrate — fall back to role-based default.
    # NOTE(review): imported lazily, presumably to avoid an import cycle
    # with the queen agent package — confirm before hoisting to the top.
    from framework.agents.queen.queen_tools_defaults import resolve_queen_default_tools

    return resolve_queen_default_tools(queen_id, mcp_catalog)
|
||||
|
||||
|
||||
def update_queen_tools_config(
    queen_id: str,
    enabled_mcp_tools: list[str] | None,
) -> list[str] | None:
    """Persist the queen's MCP allowlist to ``tools.json``.

    Returns the value that was written. Raises ``FileNotFoundError`` if
    the queen's directory is missing — we refuse to silently create a
    sidecar for a queen that doesn't exist.
    """
    if not (QUEENS_DIR / queen_id).exists():
        raise FileNotFoundError(f"Queen directory not found: {queen_id}")
    payload = {
        "enabled_mcp_tools": enabled_mcp_tools,
        "updated_at": datetime.now(UTC).isoformat(),
    }
    _atomic_write_json(tools_config_path(queen_id), payload)
    return enabled_mcp_tools
|
||||
@@ -0,0 +1,349 @@
|
||||
"""Role-based default tool allowlists for queens.
|
||||
|
||||
Every queen inherits the same MCP surface (all servers loaded for the
|
||||
queen agent), but exposing 94+ tools to every persona clutters the LLM
|
||||
tool catalog and wastes prompt tokens. This module defines a sensible
|
||||
default allowlist per queen persona so, e.g., Head of Legal doesn't
|
||||
see port scanners and Head of Brand & Design doesn't see CSV/SQL tools.
|
||||
|
||||
Defaults apply only when the queen has no ``tools.json`` sidecar — the
|
||||
moment the user saves an allowlist through the Tool Library, the
|
||||
sidecar becomes authoritative. A DELETE on the tools endpoint removes
|
||||
the sidecar and brings the queen back to her role default.
|
||||
|
||||
Category entries support a ``@server:NAME`` shorthand that expands to
|
||||
every tool name registered against that MCP server in the current
|
||||
catalog. This keeps the category table short and drift-free when new
|
||||
tools are added (e.g. browser_* auto-joins the ``browser`` category).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Categories — reusable bundles of MCP tool names.
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Each category is a flat list of either concrete tool names or the
|
||||
# ``@server:NAME`` shorthand. The shorthand expands to every tool the
|
||||
# given MCP server currently exposes (requires a live catalog; when one
|
||||
# is not available the shorthand is silently dropped so we fall back to
|
||||
# the named entries only).
|
||||
|
||||
_TOOL_CATEGORIES: dict[str, list[str]] = {
|
||||
# Unified file ops — read, write, edit, search across the files-tools
|
||||
# MCP server (read_file, write_file, edit_file, search_files). pdf_read
|
||||
# lives in hive_tools so it's listed explicitly; without it queens
|
||||
# cannot read PDF documents by default.
|
||||
"file_ops": [
|
||||
"@server:files-tools",
|
||||
"pdf_read",
|
||||
],
|
||||
# Terminal basic — the 3-tool subset queens get out of the box.
|
||||
# terminal_exec — foreground command execution (Bash equivalent)
|
||||
# terminal_rg — ripgrep content search (Grep equivalent)
|
||||
# terminal_find — glob/find file listing (Glob equivalent)
|
||||
"terminal_basic": [
|
||||
"terminal_exec",
|
||||
"terminal_rg",
|
||||
"terminal_find",
|
||||
],
|
||||
# Terminal advanced — the power-user tools beyond the basics. Not in
|
||||
# any role default; opt in explicitly per-queen via the Tool Library.
|
||||
# terminal_job_* — background job lifecycle (start/manage/logs)
|
||||
# terminal_output_get — fetch captured output from foreground exec
|
||||
# terminal_pty_* — persistent PTY sessions (open/run/close)
|
||||
"terminal_advanced": [
|
||||
"terminal_job_start",
|
||||
"terminal_job_manage",
|
||||
"terminal_job_logs",
|
||||
"terminal_output_get",
|
||||
"terminal_pty_open",
|
||||
"terminal_pty_run",
|
||||
"terminal_pty_close",
|
||||
],
|
||||
# Tabular data. CSV/Excel read/write + DuckDB SQL.
|
||||
"spreadsheet_advanced": [
|
||||
"csv_read",
|
||||
"csv_info",
|
||||
"csv_write",
|
||||
"csv_append",
|
||||
"csv_sql",
|
||||
"excel_read",
|
||||
"excel_info",
|
||||
"excel_write",
|
||||
"excel_append",
|
||||
"excel_search",
|
||||
"excel_sheet_list",
|
||||
"excel_sql",
|
||||
],
|
||||
# Browser lifecycle + read-only inspection (navigation, snapshots, query).
|
||||
# Split out from interaction so personas that only need to *observe* pages
|
||||
# (e.g. research, status checks) don't pull in click/type/drag/etc.
|
||||
"browser_basic": [
|
||||
"browser_setup",
|
||||
"browser_status",
|
||||
"browser_stop",
|
||||
"browser_tabs",
|
||||
"browser_open",
|
||||
"browser_close",
|
||||
"browser_activate_tab",
|
||||
"browser_navigate",
|
||||
"browser_go_back",
|
||||
"browser_go_forward",
|
||||
"browser_reload",
|
||||
"browser_screenshot",
|
||||
"browser_snapshot",
|
||||
"browser_html",
|
||||
"browser_console",
|
||||
"browser_evaluate",
|
||||
"browser_get_text",
|
||||
"browser_get_attribute",
|
||||
"browser_get_rect",
|
||||
"browser_shadow_query",
|
||||
],
|
||||
# Browser interaction — anything that mutates page state (clicks, typing,
|
||||
# drag, scrolling, dialogs, file uploads). Pair with browser_basic for
|
||||
# full automation; omit for read-only personas.
|
||||
"browser_interaction": [
|
||||
"browser_click",
|
||||
"browser_click_coordinate",
|
||||
"browser_type",
|
||||
"browser_type_focused",
|
||||
"browser_press",
|
||||
"browser_press_at",
|
||||
"browser_hover",
|
||||
"browser_hover_coordinate",
|
||||
"browser_select",
|
||||
"browser_scroll",
|
||||
"browser_drag",
|
||||
"browser_wait",
|
||||
"browser_resize",
|
||||
"browser_upload",
|
||||
],
|
||||
# Research — paper search, Wikipedia, ad-hoc web scrape. Pair with
|
||||
# browser_basic for richer site-by-site research; this category is the
|
||||
# lightweight always-available fallback.
|
||||
"research": ["web_scrape", "pdf_read"],
|
||||
# Security — defensive scanning and reconnaissance. Engineering-only
|
||||
# surface; the rest of the queens shouldn't see port scanners.
|
||||
"security": [
|
||||
"port_scan",
|
||||
"dns_security_scan",
|
||||
"http_headers_scan",
|
||||
"ssl_tls_scan",
|
||||
"subdomain_enumerate",
|
||||
"tech_stack_detect",
|
||||
"risk_score",
|
||||
],
|
||||
# Lightweight context helpers — good default for every queen.
|
||||
"context_awareness": [
|
||||
"get_current_time",
|
||||
"get_account_info",
|
||||
],
|
||||
# BI / financial chart + diagram rendering. Calling chart_render
|
||||
# both embeds the chart live in chat and produces a downloadable PNG.
|
||||
"charts": [
|
||||
"@server:chart-tools",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-queen mapping.
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Built from the queen personas in ``queen_profiles.DEFAULT_QUEENS``. The
|
||||
# goal is "just enough" — a queen should see tools she'd plausibly call
|
||||
# for her stated role, nothing more. Users curate further via the Tool
|
||||
# Library if they want.
|
||||
#
|
||||
# A queen whose ID is NOT in this map falls through to "allow every MCP
|
||||
# tool" (the original behavior), which keeps the system compatible with
|
||||
# user-added custom queen IDs that we don't know about.
|
||||
|
||||
QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
|
||||
# Head of Technology — builds and operates systems. Security tools
|
||||
# (port_scan, subdomain_enumerate, etc.) are intentionally NOT in the
|
||||
# default — users opt in via the Tool Library when an engagement
|
||||
# actually needs reconnaissance.
|
||||
"queen_technology": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
"charts",
|
||||
],
|
||||
# Head of Growth — data, experiments, competitor research; no security.
|
||||
"queen_growth": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
"charts",
|
||||
],
|
||||
# Head of Product Strategy — user research + roadmaps; no security.
|
||||
"queen_product_strategy": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
"charts",
|
||||
],
|
||||
# Head of Finance — financial models (CSV/Excel heavy), market research.
|
||||
"queen_finance_fundraising": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"spreadsheet_advanced",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
"charts",
|
||||
],
|
||||
# Head of Legal — reads contracts/PDFs, researches; no data/security.
|
||||
"queen_legal": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
],
|
||||
# Head of Brand & Design — visual refs, style guides; no data/security.
|
||||
"queen_brand_design": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
],
|
||||
# Head of Talent — candidate pipelines, resumes; data + browser heavy.
|
||||
"queen_talent": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"research",
|
||||
"context_awareness",
|
||||
],
|
||||
# Head of Operations — processes, automation, observability.
|
||||
"queen_operations": [
|
||||
"file_ops",
|
||||
"terminal_basic",
|
||||
"spreadsheet_advanced",
|
||||
"browser_basic",
|
||||
"browser_interaction",
|
||||
"context_awareness",
|
||||
"charts",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def has_role_default(queen_id: str) -> bool:
    """Whether the role-default category table has an entry for ``queen_id``."""
    known_ids = QUEEN_DEFAULT_CATEGORIES
    return queen_id in known_ids
|
||||
|
||||
|
||||
def list_category_names() -> list[str]:
    """Every category name in the table, preserving declaration order."""
    return [*_TOOL_CATEGORIES]
|
||||
|
||||
|
||||
def queen_role_categories(queen_id: str) -> list[str]:
    """Category names the role default assigns to ``queen_id`` (as a copy).

    Queens absent from the persona table get an empty list — they fall
    through to allow-all and carry no implicit category membership.
    """
    assigned = QUEEN_DEFAULT_CATEGORIES.get(queen_id)
    return list(assigned) if assigned else []
|
||||
|
||||
|
||||
def resolve_category_tools(
    category: str,
    mcp_catalog: dict[str, list[dict[str, Any]]] | None = None,
) -> list[str]:
    """Expand one category into its concrete, deduplicated tool names.

    Single-category counterpart of ``resolve_queen_default_tools``: it
    applies the same ``@server:NAME`` shorthand expansion so callers
    (e.g. the Tool Library API) can show per-category membership without
    re-implementing it. Without a catalog, shorthand entries are dropped.
    """
    prefix = "@server:"
    resolved: list[str] = []
    known: set[str] = set()
    for entry in _TOOL_CATEGORIES.get(category, []):
        if not entry.startswith(prefix):
            # Concrete tool name — keep first occurrence only.
            if entry not in known:
                known.add(entry)
                resolved.append(entry)
            continue
        if mcp_catalog is None:
            continue  # no catalog to expand against; drop the shorthand
        for tool in mcp_catalog.get(entry[len(prefix):], []) or []:
            candidate = tool.get("name") if isinstance(tool, dict) else None
            if candidate and candidate not in known:
                known.add(candidate)
                resolved.append(candidate)
    return resolved
|
||||
|
||||
|
||||
def resolve_queen_default_tools(
    queen_id: str,
    mcp_catalog: dict[str, list[dict[str, Any]]] | None = None,
) -> list[str] | None:
    """Return the role-based default allowlist for ``queen_id``.

    Arguments:
        queen_id: Profile ID (e.g. ``"queen_technology"``).
        mcp_catalog: Optional ``{server_name: [{"name": ...}, ...]}`` map
            used to expand ``@server:NAME`` shorthands in categories.
            When absent, shorthand entries are dropped and only the
            explicitly-named tools remain.

    Returns:
        A deduplicated list of tool names, or ``None`` if the queen has
        no role entry (caller should treat as "allow every MCP tool").
    """
    categories = QUEEN_DEFAULT_CATEGORIES.get(queen_id)
    if not categories:
        return None

    prefix = "@server:"
    allowlist: list[str] = []
    seen_names: set[str] = set()

    def _remember(name: str) -> None:
        # First occurrence wins; empty names are ignored.
        if name and name not in seen_names:
            seen_names.add(name)
            allowlist.append(name)

    for category in categories:
        for entry in _TOOL_CATEGORIES.get(category, []):
            if not entry.startswith(prefix):
                _remember(entry)
                continue
            if mcp_catalog is None:
                logger.debug(
                    "resolve_queen_default_tools: catalog missing; cannot expand %s",
                    entry,
                )
                continue
            server_name = entry[len(prefix):]
            for tool in mcp_catalog.get(server_name, []) or []:
                candidate = tool.get("name") if isinstance(tool, dict) else None
                if candidate:
                    _remember(candidate)

    return allowlist
|
||||
@@ -17,8 +17,8 @@ Use browser nodes (with `tools: {policy: "all"}`) when:
|
||||
## Available Browser Tools
|
||||
|
||||
All tools are prefixed with `browser_`:
|
||||
- `browser_start`, `browser_open`, `browser_navigate` — launch/navigate
|
||||
- `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type`, `browser_type_focused` — interact
|
||||
- `browser_open`, `browser_navigate` — both lazy-create the browser context, so a single `browser_open(url)` covers the cold path. To recover from a stale context, call `browser_stop` then `browser_open(url)` again.
|
||||
- `browser_click`, `browser_click_coordinate`, `browser_type`, `browser_type_focused` — interact
|
||||
- `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
|
||||
- `browser_snapshot` — compact accessibility-tree read (structured)
|
||||
<!-- vision-only -->
|
||||
@@ -27,7 +27,7 @@ All tools are prefixed with `browser_`:
|
||||
- `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`)
|
||||
- `browser_scroll`, `browser_wait` — navigation helpers
|
||||
- `browser_evaluate` — run JavaScript
|
||||
- `browser_close`, `browser_close_finished` — tab cleanup
|
||||
- `browser_close` — tab cleanup (call per tab; closes the active tab when `tab_id` is omitted)
|
||||
|
||||
## Pick the right reading tool
|
||||
|
||||
|
||||
@@ -155,6 +155,58 @@ def get_preferred_worker_model() -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def get_vision_fallback_model() -> str | None:
    """Return the configured vision-fallback model, or None if not configured.

    Reads the ``vision_fallback`` section of ~/.hive/configuration.json.
    Used by the agent-loop hook that captions tool-result images when the
    main agent's model cannot accept image content (text-only LLMs).

    When this returns None the captioning chain's configured + retry
    attempts both no-op (returning None), and only the final
    ``gemini/gemini-3-flash-preview`` override has a chance to succeed
    — and only if a ``GEMINI_API_KEY`` is set in the environment.
    """
    section = get_hive_config().get("vision_fallback", {})
    if not (section.get("provider") and section.get("model")):
        return None
    provider = str(section["provider"])
    model = str(section["model"]).strip()
    # Collapse a redundant "openrouter/" prefix on OpenRouter model ids.
    if provider.lower() == "openrouter" and model.lower().startswith("openrouter/"):
        model = model[len("openrouter/"):]
    return f"{provider}/{model}" if model else None
|
||||
|
||||
|
||||
def get_vision_fallback_api_key() -> str | None:
    """Return the API key for the vision-fallback model.

    Resolution order: ``vision_fallback.api_key_env_var`` from the env,
    then the default ``get_api_key()``. No subscription-token branches —
    vision fallback is intended for hosted vision models (Anthropic,
    OpenAI, Google), not for the subscription-bearer providers.
    """
    section = get_hive_config().get("vision_fallback", {})
    env_var = section.get("api_key_env_var") if section else None
    if env_var:
        return os.environ.get(env_var)
    return get_api_key()
|
||||
|
||||
|
||||
def get_vision_fallback_api_base() -> str | None:
    """Return the api_base for the vision-fallback model, or None."""
    section = get_hive_config().get("vision_fallback", {})
    if not section:
        return None
    explicit = section.get("api_base")
    if explicit:
        return explicit
    # OpenRouter has a well-known base URL; supply it when none is set.
    provider = str(section.get("provider", "")).lower()
    return OPENROUTER_API_BASE if provider == "openrouter" else None
|
||||
|
||||
|
||||
def get_worker_api_key() -> str | None:
|
||||
"""Return the API key for the worker LLM, falling back to the default key."""
|
||||
worker_llm = get_hive_config().get("worker_llm", {})
|
||||
|
||||
@@ -16,9 +16,14 @@ import os
|
||||
import stat
|
||||
from pathlib import Path
|
||||
|
||||
# Resolved once at module import. ``framework.config.HIVE_HOME`` reads
|
||||
# the desktop's ``HIVE_HOME`` env var at its own import time, so the
|
||||
# runtime always sees the per-user root before this constant is computed.
|
||||
from framework.config import HIVE_HOME as _HIVE_HOME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CREDENTIAL_KEY_PATH = Path.home() / ".hive" / "secrets" / "credential_key"
|
||||
CREDENTIAL_KEY_PATH = _HIVE_HOME / "secrets" / "credential_key"
|
||||
CREDENTIAL_KEY_ENV_VAR = "HIVE_CREDENTIAL_KEY"
|
||||
ADEN_CREDENTIAL_ID = "aden_api_key"
|
||||
ADEN_ENV_VAR = "ADEN_API_KEY"
|
||||
|
||||
@@ -128,7 +128,9 @@ class EncryptedFileStorage(CredentialStorage):
|
||||
Initialize encrypted storage.
|
||||
|
||||
Args:
|
||||
base_path: Directory for credential files. Defaults to ~/.hive/credentials.
|
||||
base_path: Directory for credential files. Defaults to
|
||||
``$HIVE_HOME/credentials`` (per-user) when HIVE_HOME is set,
|
||||
else ``~/.hive/credentials``.
|
||||
encryption_key: 32-byte Fernet key. If None, reads from env var.
|
||||
key_env_var: Environment variable containing encryption key
|
||||
"""
|
||||
@@ -139,7 +141,14 @@ class EncryptedFileStorage(CredentialStorage):
|
||||
"Encrypted storage requires 'cryptography'. Install with: uv pip install cryptography"
|
||||
) from e
|
||||
|
||||
self.base_path = Path(base_path or self.DEFAULT_PATH).expanduser()
|
||||
if base_path is None:
|
||||
# Honor HIVE_HOME (set by the desktop shell to a per-user dir) so
|
||||
# the encrypted store doesn't fork between ~/.hive and the desktop
|
||||
# userData root. Falls back to ~/.hive/credentials when standalone.
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
base_path = HIVE_HOME / "credentials"
|
||||
self.base_path = Path(base_path).expanduser()
|
||||
self._ensure_dirs()
|
||||
self._key_env_var = key_env_var
|
||||
|
||||
@@ -510,7 +519,7 @@ class EnvVarStorage(CredentialStorage):
|
||||
def exists(self, credential_id: str) -> bool:
|
||||
"""Check if credential is available in environment."""
|
||||
env_var = self._get_env_var_name(credential_id)
|
||||
return self._read_env_value(env_var) is not None
|
||||
return bool(self._read_env_value(env_var))
|
||||
|
||||
def add_mapping(self, credential_id: str, env_var: str) -> None:
|
||||
"""
|
||||
|
||||
@@ -745,13 +745,14 @@ class CredentialStore:
|
||||
token = store.get_key("hubspot", "access_token")
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from .storage import EncryptedFileStorage
|
||||
|
||||
# Determine local storage path
|
||||
if local_path is None:
|
||||
local_path = str(Path.home() / ".hive" / "credentials")
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
local_path = str(HIVE_HOME / "credentials")
|
||||
|
||||
local_storage = EncryptedFileStorage(base_path=local_path)
|
||||
|
||||
|
||||
@@ -258,6 +258,14 @@ class TestEnvVarStorage:
|
||||
with pytest.raises(NotImplementedError):
|
||||
storage.delete("test")
|
||||
|
||||
def test_exists_matches_load_for_empty_value(self):
|
||||
"""Test exists() and load() stay consistent for empty values."""
|
||||
storage = EnvVarStorage(env_mapping={"empty": "EMPTY_API_KEY"})
|
||||
|
||||
with patch.object(storage, "_read_env_value", return_value=""):
|
||||
assert storage.load("empty") is None
|
||||
assert not storage.exists("empty")
|
||||
|
||||
|
||||
class TestEncryptedFileStorage:
|
||||
"""Tests for EncryptedFileStorage."""
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
"""Read/write helpers for per-colony metadata.json.
|
||||
|
||||
A colony's metadata.json lives at ``{COLONIES_DIR}/{colony_name}/metadata.json``
|
||||
and holds immutable provenance: the queen that created it, the forked
|
||||
session id, creation/update timestamps, and the list of workers.
|
||||
|
||||
Mutable user-editable tool configuration lives in a sibling
|
||||
``tools.json`` sidecar — see :mod:`framework.host.colony_tools_config`
|
||||
— so identity and tool gating evolve independently.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.config import COLONIES_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def colony_metadata_path(colony_name: str) -> Path:
    """On-disk location of ``metadata.json`` for ``colony_name``."""
    return COLONIES_DIR.joinpath(colony_name, "metadata.json")
|
||||
|
||||
|
||||
def load_colony_metadata(colony_name: str) -> dict[str, Any]:
    """Load metadata.json for ``colony_name``.

    Missing or malformed files yield ``{}`` — callers treat absent
    fields as defaults rather than erroring.
    """
    path = colony_metadata_path(colony_name)
    if not path.exists():
        return {}
    try:
        parsed = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        logger.warning("Failed to read colony metadata at %s", path)
        return {}
    # Guard against a valid-JSON-but-not-an-object file (e.g. a list).
    if isinstance(parsed, dict):
        return parsed
    return {}
|
||||
|
||||
|
||||
def update_colony_metadata(colony_name: str, updates: dict[str, Any]) -> dict[str, Any]:
    """Shallow-merge ``updates`` into metadata.json and persist.

    Returns the full updated dict. Raises ``FileNotFoundError`` if the
    colony does not exist. Writes atomically via ``os.replace`` to
    minimize the window where a reader could see a half-written file.
    """
    import os
    import tempfile

    path = colony_metadata_path(colony_name)
    if not path.parent.exists():
        raise FileNotFoundError(f"Colony '{colony_name}' not found")

    # load_colony_metadata already returns {} for a missing or malformed
    # file, so no exists() pre-check is needed before loading.
    data = load_colony_metadata(colony_name)
    data.update(updates)

    path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file, fsync, then os.replace() into place so
    # a concurrent reader never observes a partially written metadata.json.
    fd, tmp_path = tempfile.mkstemp(
        prefix=".metadata.",
        suffix=".json.tmp",
        dir=str(path.parent),
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=2)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup of the temp file; re-raise the original error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return data
|
||||
|
||||
|
||||
def list_colony_names() -> list[str]:
    """Return the names of every colony that has a metadata.json on disk."""
    if not COLONIES_DIR.is_dir():
        return []
    return [
        entry.name
        for entry in sorted(COLONIES_DIR.iterdir())
        if entry.is_dir() and (entry / "metadata.json").exists()
    ]
|
||||
@@ -185,6 +185,8 @@ class ColonyRuntime:
|
||||
protocols_prompt: str = "",
|
||||
skill_dirs: list[str] | None = None,
|
||||
pipeline_stages: list | None = None,
|
||||
queen_id: str | None = None,
|
||||
colony_name: str | None = None,
|
||||
):
|
||||
from framework.pipeline.runner import PipelineRunner
|
||||
from framework.skills.manager import SkillsManager
|
||||
@@ -193,14 +195,27 @@ class ColonyRuntime:
|
||||
self._goal = goal
|
||||
self._config = config or ColonyConfig()
|
||||
self._runtime_log_store = runtime_log_store
|
||||
self._queen_id: str | None = queen_id
|
||||
# ``colony_id`` is the event-bus scope (session.id in DM sessions);
|
||||
# ``colony_name`` is the on-disk identity under ~/.hive/colonies/.
|
||||
# They coincide for forked colonies but diverge for queen DM
|
||||
# sessions, so separate them explicitly.
|
||||
self._colony_name: str | None = colony_name
|
||||
|
||||
if pipeline_stages:
|
||||
self._pipeline = PipelineRunner(pipeline_stages)
|
||||
else:
|
||||
self._pipeline = self._load_pipeline_from_config()
|
||||
|
||||
if skills_manager_config is not None:
|
||||
self._skills_manager = SkillsManager(skills_manager_config)
|
||||
# Resolve per-colony override paths so UI toggles can reach this
|
||||
# runtime. Callers that build their own SkillsManagerConfig stay
|
||||
# in charge; bare construction auto-wires the standard paths.
|
||||
_effective_cfg = skills_manager_config
|
||||
if _effective_cfg is None and not (skills_catalog_prompt or protocols_prompt):
|
||||
_effective_cfg = self._build_default_skills_config(colony_name, queen_id)
|
||||
|
||||
if _effective_cfg is not None:
|
||||
self._skills_manager = SkillsManager(_effective_cfg)
|
||||
self._skills_manager.load()
|
||||
elif skills_catalog_prompt or protocols_prompt:
|
||||
import warnings
|
||||
@@ -221,6 +236,28 @@ class ColonyRuntime:
|
||||
self.batch_init_nudge: str | None = self._skills_manager.batch_init_nudge
|
||||
|
||||
self._colony_id: str = colony_id or "primary"
|
||||
|
||||
# Ensure the colony task template exists. Idempotent — if the
|
||||
# colony was created previously, this is a no-op (it just stamps
|
||||
# last_seen_session_ids if a session id is provided later).
|
||||
try:
|
||||
import asyncio as _asyncio
|
||||
|
||||
from framework.tasks import TaskListRole, get_task_store
|
||||
from framework.tasks.scoping import colony_task_list_id
|
||||
|
||||
_store = get_task_store()
|
||||
_list_id = colony_task_list_id(self._colony_id)
|
||||
try:
|
||||
# Best-effort: schedule on the running loop, or do it inline
|
||||
# if no loop is yet running (e.g. during construction).
|
||||
_loop = _asyncio.get_running_loop()
|
||||
_loop.create_task(_store.ensure_task_list(_list_id, role=TaskListRole.TEMPLATE))
|
||||
except RuntimeError:
|
||||
_asyncio.run(_store.ensure_task_list(_list_id, role=TaskListRole.TEMPLATE))
|
||||
except Exception:
|
||||
logger.debug("Failed to ensure colony task template", exc_info=True)
|
||||
|
||||
self._accounts_prompt = accounts_prompt
|
||||
self._accounts_data = accounts_data
|
||||
self._tool_provider_map = tool_provider_map
|
||||
@@ -238,10 +275,33 @@ class ColonyRuntime:
|
||||
self._event_bus = event_bus or EventBus(max_history=self._config.max_history)
|
||||
self._scoped_event_bus = StreamEventBus(self._event_bus, self._colony_id)
|
||||
|
||||
# Make the event bus visible to the task-system event emitters so
|
||||
# task lifecycle events fan out to the same bus the rest of the
|
||||
# system uses. Idempotent — last writer wins.
|
||||
try:
|
||||
from framework.tasks.events import set_default_event_bus
|
||||
|
||||
set_default_event_bus(self._event_bus)
|
||||
except Exception:
|
||||
logger.debug("Failed to register default task event bus", exc_info=True)
|
||||
|
||||
self._llm = llm
|
||||
self._tools = tools or []
|
||||
self._tool_executor = tool_executor
|
||||
|
||||
# Per-colony MCP tool allowlist — applied when spawning workers. A
|
||||
# value of ``None`` means "allow every MCP tool" (default), an empty
|
||||
# list disables every MCP tool, and a list of names only enables
|
||||
# those. Lifecycle / synthetic tools always pass through the filter
|
||||
# because their names are absent from ``_mcp_tool_names_all``. The
|
||||
# allowlist is re-read on every ``spawn`` so a PATCH that mutates
|
||||
# this attribute via ``set_tool_allowlist`` takes effect on the
|
||||
# NEXT worker spawn without a runtime restart. In-flight workers
|
||||
# keep the tool list they booted with — workers have no dynamic
|
||||
# tools provider today.
|
||||
self._enabled_mcp_tools: list[str] | None = None
|
||||
self._mcp_tool_names_all: set[str] = set()
|
||||
|
||||
# Worker management
|
||||
self._workers: dict[str, Worker] = {}
|
||||
# The persistent client-facing overseer (optional). Set by
|
||||
@@ -359,6 +419,19 @@ class ColonyRuntime:
|
||||
def _apply_pipeline_results(self) -> None:
|
||||
for stage in self._pipeline.stages:
|
||||
if stage.tool_registry is not None:
|
||||
# Register task tools on the same registry every worker
|
||||
# pulls from. Done here (not at worker spawn) so the
|
||||
# colony's `_tools` snapshot includes them.
|
||||
try:
|
||||
from framework.tasks.tools import register_task_tools
|
||||
|
||||
register_task_tools(stage.tool_registry)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Failed to register task tools on pipeline registry",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
tools = list(stage.tool_registry.get_tools().values())
|
||||
if tools:
|
||||
self._tools = tools
|
||||
@@ -384,6 +457,136 @@ class ColonyRuntime:
|
||||
return PipelineRunner([])
|
||||
return build_pipeline_from_config(stages_config)
|
||||
|
||||
@staticmethod
|
||||
def _build_default_skills_config(
|
||||
colony_name: str | None,
|
||||
queen_id: str | None,
|
||||
) -> SkillsManagerConfig:
|
||||
"""Assemble a ``SkillsManagerConfig`` that wires in the per-colony /
|
||||
per-queen override files and the ``queen_ui`` / ``colony_ui`` scope
|
||||
dirs based on the standard ``~/.hive`` layout.
|
||||
|
||||
``colony_name`` must be an actual on-disk colony name
|
||||
(``~/.hive/colonies/{name}/``). DM sessions where the ``colony_id``
|
||||
is a session UUID should pass ``None`` so we don't create a stray
|
||||
override file under a session identifier.
|
||||
"""
|
||||
from framework.config import COLONIES_DIR, QUEENS_DIR
|
||||
from framework.skills.discovery import ExtraScope
|
||||
from framework.skills.manager import SkillsManagerConfig
|
||||
|
||||
extras: list[ExtraScope] = []
|
||||
queen_overrides_path: Path | None = None
|
||||
if queen_id:
|
||||
queen_home = QUEENS_DIR / queen_id
|
||||
queen_overrides_path = queen_home / "skills_overrides.json"
|
||||
extras.append(ExtraScope(directory=queen_home / "skills", label="queen_ui", priority=2))
|
||||
|
||||
colony_overrides_path: Path | None = None
|
||||
if colony_name:
|
||||
colony_home = COLONIES_DIR / colony_name
|
||||
colony_overrides_path = colony_home / "skills_overrides.json"
|
||||
# Surface both the new flat ``skills/`` (where new skills are
|
||||
# written) and the legacy nested ``.hive/skills/`` (left intact
|
||||
# for pre-flatten colonies) as tagged ``colony_ui`` scopes, so
|
||||
# UI-created entries resolve with correct provenance regardless
|
||||
# of which on-disk layout the colony has.
|
||||
extras.append(
|
||||
ExtraScope(
|
||||
directory=colony_home / "skills",
|
||||
label="colony_ui",
|
||||
priority=3,
|
||||
)
|
||||
)
|
||||
extras.append(
|
||||
ExtraScope(
|
||||
directory=colony_home / ".hive" / "skills",
|
||||
label="colony_ui",
|
||||
priority=3,
|
||||
)
|
||||
)
|
||||
|
||||
return SkillsManagerConfig(
|
||||
queen_id=queen_id,
|
||||
queen_overrides_path=queen_overrides_path,
|
||||
colony_name=colony_name,
|
||||
colony_overrides_path=colony_overrides_path,
|
||||
extra_scope_dirs=extras,
|
||||
interactive=False, # HTTP-driven runtimes never prompt for consent
|
||||
)
|
||||
|
||||
@property
|
||||
def queen_id(self) -> str | None:
|
||||
"""The queen that owns this runtime, if known."""
|
||||
return self._queen_id
|
||||
|
||||
@property
|
||||
def colony_name(self) -> str | None:
|
||||
"""The on-disk colony name (distinct from event-bus scope ``colony_id``)."""
|
||||
return self._colony_name
|
||||
|
||||
@property
|
||||
def skills_manager(self):
|
||||
"""Access the live :class:`SkillsManager` (for HTTP handlers)."""
|
||||
return self._skills_manager
|
||||
|
||||
async def reload_skills(self) -> dict[str, Any]:
|
||||
"""Rebuild the catalog after an override change; in-flight workers
|
||||
pick up the new catalog on their next iteration via
|
||||
``dynamic_skills_catalog_provider``.
|
||||
|
||||
Returns a small stats dict that HTTP handlers can echo back to
|
||||
the UI ("applied — N skills now in catalog").
|
||||
"""
|
||||
async with self._skills_manager.mutation_lock:
|
||||
self._skills_manager.reload()
|
||||
self.skill_dirs = self._skills_manager.allowlisted_dirs
|
||||
self.batch_init_nudge = self._skills_manager.batch_init_nudge
|
||||
self.context_warn_ratio = self._skills_manager.context_warn_ratio
|
||||
catalog_prompt = self._skills_manager.skills_catalog_prompt
|
||||
return {
|
||||
"catalog_chars": len(catalog_prompt),
|
||||
"skill_dirs": list(self.skill_dirs),
|
||||
}
|
||||
|
||||
# ── Per-colony tool allowlist ───────────────────────────────
|
||||
|
||||
def set_tool_allowlist(
|
||||
self,
|
||||
enabled_mcp_tools: list[str] | None,
|
||||
mcp_tool_names_all: set[str] | None = None,
|
||||
) -> None:
|
||||
"""Configure the per-colony MCP tool allowlist.
|
||||
|
||||
Called at construction time (from SessionManager) and again from
|
||||
the ``/api/colony/{name}/tools`` PATCH handler when a user edits
|
||||
the allowlist. The change applies to the NEXT worker spawn — we
|
||||
never mutate the tool list of a worker that is already running
|
||||
(workers have no dynamic tools provider, so hot-reloading their
|
||||
tool set would diverge from the list the LLM was already using).
|
||||
"""
|
||||
self._enabled_mcp_tools = list(enabled_mcp_tools) if enabled_mcp_tools is not None else None
|
||||
if mcp_tool_names_all is not None:
|
||||
self._mcp_tool_names_all = set(mcp_tool_names_all)
|
||||
|
||||
def _apply_tool_allowlist(self, tools: list) -> list:
|
||||
"""Filter ``tools`` against the colony's MCP allowlist.
|
||||
|
||||
Lifecycle / synthetic tools (those whose names are NOT in
|
||||
``_mcp_tool_names_all``) are never gated. MCP tools are kept only
|
||||
when ``_enabled_mcp_tools`` is None (default allow) or contains
|
||||
their name. Input list order is preserved so downstream cache
|
||||
keys and logs stay stable.
|
||||
"""
|
||||
if self._enabled_mcp_tools is None:
|
||||
return tools
|
||||
allowed = set(self._enabled_mcp_tools)
|
||||
return [
|
||||
t
|
||||
for t in tools
|
||||
if getattr(t, "name", None) not in self._mcp_tool_names_all or getattr(t, "name", None) in allowed
|
||||
]
|
||||
|
||||
# ── Lifecycle ───────────────────────────────────────────────
|
||||
|
||||
async def start(self) -> None:
|
||||
@@ -658,6 +861,14 @@ class ColonyRuntime:
|
||||
spawn_tools = tools if tools is not None else self._tools
|
||||
spawn_executor = tool_executor or self._tool_executor
|
||||
|
||||
# Apply the per-colony MCP tool allowlist (if any). Done HERE —
|
||||
# after spawn_tools is resolved but before it's frozen into the
|
||||
# worker's AgentContext — so the next spawn reflects any PATCH
|
||||
# that happened since the last spawn. A value of ``None`` on
|
||||
# ``_enabled_mcp_tools`` is a no-op so the default path is
|
||||
# unchanged.
|
||||
spawn_tools = self._apply_tool_allowlist(spawn_tools)
|
||||
|
||||
# Colony progress tracker: when the caller supplied a db_path
|
||||
# in input_data, this worker is part of a SQLite task queue
|
||||
# and must see the hive.colony-progress-tracker skill body in
|
||||
@@ -740,6 +951,34 @@ class ColonyRuntime:
|
||||
conversation_store=worker_conv_store,
|
||||
)
|
||||
|
||||
# Workers pick up UI-driven override changes via this provider,
|
||||
# which reads the live catalog on each iteration. The db_path
|
||||
# pre-activated catalog stays static because its contents are
|
||||
# built for *this* worker's task (a tombstone toggle from the
|
||||
# UI should not yank it mid-run).
|
||||
_db_path_pre_activated = bool(isinstance(input_data, dict) and input_data.get("db_path"))
|
||||
# Default-bind the manager into the closure so each loop iteration
|
||||
# captures the same manager instance — pyflakes B023 would flag a
|
||||
# free-variable capture here.
|
||||
_provider = None if _db_path_pre_activated else (lambda mgr=self._skills_manager: mgr.skills_catalog_prompt)
|
||||
|
||||
# Task-system fields. Each worker owns its session task list;
|
||||
# picked_up_from records the colony template entry it was
|
||||
# spawned for, when applicable.
|
||||
from framework.tasks.scoping import (
|
||||
colony_task_list_id as _colony_list_id,
|
||||
session_task_list_id as _session_list_id,
|
||||
)
|
||||
|
||||
_worker_list_id = _session_list_id(worker_id, worker_id)
|
||||
_picked_up = None
|
||||
_template_id = input_data.get("__template_task_id") if isinstance(input_data, dict) else None
|
||||
if _template_id is not None:
|
||||
try:
|
||||
_picked_up = (_colony_list_id(self._colony_id), int(_template_id))
|
||||
except (TypeError, ValueError):
|
||||
_picked_up = None
|
||||
|
||||
agent_context = AgentContext(
|
||||
runtime=self._make_runtime_adapter(worker_id),
|
||||
agent_id=worker_id,
|
||||
@@ -753,8 +992,12 @@ class ColonyRuntime:
|
||||
skills_catalog_prompt=_spawn_catalog,
|
||||
protocols_prompt=self.protocols_prompt,
|
||||
skill_dirs=_spawn_skill_dirs,
|
||||
dynamic_skills_catalog_provider=_provider,
|
||||
execution_id=worker_id,
|
||||
stream_id=explicit_stream_id or f"worker:{worker_id}",
|
||||
task_list_id=_worker_list_id,
|
||||
colony_id=self._colony_id,
|
||||
picked_up_from=_picked_up,
|
||||
)
|
||||
|
||||
worker = Worker(
|
||||
@@ -997,6 +1240,7 @@ class ColonyRuntime:
|
||||
conversation_store=overseer_conv_store,
|
||||
)
|
||||
|
||||
_overseer_skills_mgr = self._skills_manager
|
||||
overseer_ctx = AgentContext(
|
||||
runtime=self._make_runtime_adapter(overseer_id),
|
||||
agent_id=overseer_id,
|
||||
@@ -1010,6 +1254,7 @@ class ColonyRuntime:
|
||||
skills_catalog_prompt=self.skills_catalog_prompt,
|
||||
protocols_prompt=self.protocols_prompt,
|
||||
skill_dirs=self.skill_dirs,
|
||||
dynamic_skills_catalog_provider=lambda: _overseer_skills_mgr.skills_catalog_prompt,
|
||||
execution_id=overseer_id,
|
||||
stream_id="overseer",
|
||||
)
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
"""Per-colony tool configuration sidecar (``tools.json``).
|
||||
|
||||
Lives at ``~/.hive/colonies/{colony_name}/tools.json`` alongside
|
||||
``metadata.json``. Kept separate so provenance (queen_name,
|
||||
created_at, workers) stays in metadata while the user-editable tool
|
||||
allowlist gets its own file.
|
||||
|
||||
Schema::
|
||||
|
||||
{
|
||||
"enabled_mcp_tools": ["read_file", ...] | null,
|
||||
"updated_at": "2026-04-21T12:34:56+00:00"
|
||||
}
|
||||
|
||||
- ``null`` / missing file → default "allow every MCP tool".
|
||||
- ``[]`` → explicitly disable every MCP tool.
|
||||
- ``["foo", "bar"]`` → only those MCP tool names pass the filter.
|
||||
|
||||
Atomic writes via ``os.replace`` mirror
|
||||
``framework.host.colony_metadata.update_colony_metadata``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.config import COLONIES_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tools_config_path(colony_name: str) -> Path:
    """Return the on-disk path to a colony's ``tools.json``."""
    colony_dir = COLONIES_DIR / colony_name
    return colony_dir / "tools.json"
|
||||
|
||||
|
||||
def _metadata_path(colony_name: str) -> Path:
    """Path to the colony's metadata.json (read only during legacy migration)."""
    colony_dir = COLONIES_DIR / colony_name
    return colony_dir / "metadata.json"
|
||||
|
||||
|
||||
def _atomic_write_json(path: Path, data: dict[str, Any]) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
fd, tmp = tempfile.mkstemp(
|
||||
prefix=".tools.",
|
||||
suffix=".json.tmp",
|
||||
dir=str(path.parent),
|
||||
)
|
||||
try:
|
||||
with os.fdopen(fd, "w", encoding="utf-8") as fh:
|
||||
json.dump(data, fh, indent=2)
|
||||
fh.flush()
|
||||
os.fsync(fh.fileno())
|
||||
os.replace(tmp, path)
|
||||
except BaseException:
|
||||
try:
|
||||
os.unlink(tmp)
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
|
||||
def _migrate_from_metadata_if_needed(colony_name: str) -> list[str] | None:
    """Hoist a legacy ``enabled_mcp_tools`` field out of ``metadata.json``.

    Returns the migrated value (or ``None`` if nothing to migrate). After
    migration the sidecar exists and ``metadata.json`` no longer contains
    ``enabled_mcp_tools``. Safe to call repeatedly.
    """
    meta_path = _metadata_path(colony_name)
    if not meta_path.exists():
        return None
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        logger.warning("Could not read metadata.json during tools migration: %s", colony_name)
        return None
    if not isinstance(meta, dict) or "enabled_mcp_tools" not in meta:
        return None

    legacy = meta.pop("enabled_mcp_tools")
    migrated: list[str] | None
    if legacy is None:
        migrated = None
    elif isinstance(legacy, list) and all(isinstance(item, str) for item in legacy):
        migrated = legacy
    else:
        logger.warning(
            "Legacy enabled_mcp_tools on colony %s had unexpected shape %r; dropping",
            colony_name,
            legacy,
        )
        migrated = None

    # Sidecar first so a partial failure leaves the config recoverable.
    _atomic_write_json(
        tools_config_path(colony_name),
        {
            "enabled_mcp_tools": migrated,
            "updated_at": datetime.now(UTC).isoformat(),
        },
    )
    _atomic_write_json(meta_path, meta)
    logger.info(
        "Migrated enabled_mcp_tools for colony %s from metadata.json to tools.json",
        colony_name,
    )
    return migrated
|
||||
|
||||
|
||||
def load_colony_tools_config(colony_name: str) -> list[str] | None:
    """Return the colony's MCP tool allowlist, or ``None`` for default-allow.

    Order of resolution:
    1. ``tools.json`` sidecar (authoritative).
    2. Legacy ``metadata.json`` field (migrated and deleted on first read).
    3. ``None`` — default "allow every MCP tool".
    """
    path = tools_config_path(colony_name)
    if not path.exists():
        # No sidecar yet — fall back to (and migrate) any legacy field.
        return _migrate_from_metadata_if_needed(colony_name)

    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        logger.warning("Invalid %s; treating as default-allow", path)
        return None
    if not isinstance(payload, dict):
        return None
    allowlist = payload.get("enabled_mcp_tools")
    if allowlist is None:
        return None
    if isinstance(allowlist, list) and all(isinstance(item, str) for item in allowlist):
        return allowlist
    logger.warning("Unexpected enabled_mcp_tools shape in %s; ignoring", path)
    return None
|
||||
|
||||
|
||||
def update_colony_tools_config(
    colony_name: str,
    enabled_mcp_tools: list[str] | None,
) -> list[str] | None:
    """Persist a colony's MCP allowlist to ``tools.json``.

    Raises ``FileNotFoundError`` if the colony's directory is missing.
    """
    if not (COLONIES_DIR / colony_name).exists():
        raise FileNotFoundError(f"Colony directory not found: {colony_name}")
    payload = {
        "enabled_mcp_tools": enabled_mcp_tools,
        "updated_at": datetime.now(UTC).isoformat(),
    }
    _atomic_write_json(tools_config_path(colony_name), payload)
    return enabled_mcp_tools
|
||||
@@ -42,7 +42,9 @@ def _open_event_log() -> IO[str] | None:
|
||||
return None
|
||||
raw = _DEBUG_EVENTS_RAW
|
||||
if raw.lower() in ("1", "true", "full"):
|
||||
log_dir = Path.home() / ".hive" / "event_logs"
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
log_dir = HIVE_HOME / "event_logs"
|
||||
else:
|
||||
log_dir = Path(raw)
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -165,6 +167,14 @@ class EventType(StrEnum):
|
||||
TRIGGER_REMOVED = "trigger_removed"
|
||||
TRIGGER_UPDATED = "trigger_updated"
|
||||
|
||||
# Task system lifecycle (per-list diffs streamed to the UI)
|
||||
TASK_CREATED = "task_created"
|
||||
TASK_UPDATED = "task_updated"
|
||||
TASK_DELETED = "task_deleted"
|
||||
TASK_LIST_RESET = "task_list_reset"
|
||||
TASK_LIST_REATTACH_MISMATCH = "task_list_reattach_mismatch"
|
||||
COLONY_TEMPLATE_ASSIGNMENT = "colony_template_assignment"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgentEvent:
|
||||
@@ -809,16 +819,28 @@ class EventBus:
|
||||
input_tokens: int,
|
||||
output_tokens: int,
|
||||
cached_tokens: int = 0,
|
||||
cache_creation_tokens: int = 0,
|
||||
cost_usd: float = 0.0,
|
||||
execution_id: str | None = None,
|
||||
iteration: int | None = None,
|
||||
) -> None:
|
||||
"""Emit LLM turn completion with stop reason and model metadata."""
|
||||
"""Emit LLM turn completion with stop reason and model metadata.
|
||||
|
||||
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
|
||||
``input_tokens`` (already inside provider ``prompt_tokens``).
|
||||
Subscribers should display them, not add them to a total.
|
||||
|
||||
``cost_usd`` is the USD cost for this turn when known (Anthropic,
|
||||
OpenAI, OpenRouter). 0.0 means unreported (not free).
|
||||
"""
|
||||
data: dict = {
|
||||
"stop_reason": stop_reason,
|
||||
"model": model,
|
||||
"input_tokens": input_tokens,
|
||||
"output_tokens": output_tokens,
|
||||
"cached_tokens": cached_tokens,
|
||||
"cache_creation_tokens": cache_creation_tokens,
|
||||
"cost_usd": cost_usd,
|
||||
}
|
||||
if iteration is not None:
|
||||
data["iteration"] = iteration
|
||||
@@ -914,24 +936,22 @@ class EventBus:
|
||||
self,
|
||||
stream_id: str,
|
||||
node_id: str,
|
||||
prompt: str = "",
|
||||
execution_id: str | None = None,
|
||||
options: list[str] | None = None,
|
||||
questions: list[dict] | None = None,
|
||||
) -> None:
|
||||
"""Emit a user-input request for interactive queen turns.
|
||||
|
||||
Args:
|
||||
options: Optional predefined choices for the user (1-3 items).
|
||||
The frontend appends an "Other" free-text option
|
||||
automatically.
|
||||
questions: Optional list of question dicts for multi-question
|
||||
batches (from ask_user_multiple). Each dict has id,
|
||||
prompt, and optional options.
|
||||
questions: Optional list of question dicts from ``ask_user``.
|
||||
Each dict has ``id``, ``prompt``, and optional ``options``
|
||||
(2-3 predefined choices). The frontend renders the
|
||||
QuestionWidget for a single-entry list and the
|
||||
MultiQuestionWidget for 2+ entries. Free-text asks (no
|
||||
options) stream the prompt separately as a chat message;
|
||||
auto-block turns have no questions at all and fall back
|
||||
to the normal text input.
|
||||
"""
|
||||
data: dict[str, Any] = {"prompt": prompt}
|
||||
if options:
|
||||
data["options"] = options
|
||||
data: dict[str, Any] = {}
|
||||
if questions:
|
||||
data["questions"] = questions
|
||||
await self.publish(
|
||||
|
||||
@@ -7,7 +7,7 @@ verify SOP gates before marking a task done. This gives cross-run memory
|
||||
that the existing per-iteration stall detectors don't have.
|
||||
|
||||
The DB is driven by agents via the ``sqlite3`` CLI through
|
||||
``execute_command_tool``. This module handles framework-side lifecycle:
|
||||
``terminal_exec``. This module handles framework-side lifecycle:
|
||||
creation, migration, queen-side bulk seeding, stale-claim reclamation.
|
||||
|
||||
Concurrency model:
|
||||
@@ -264,7 +264,9 @@ def ensure_all_colony_dbs(colonies_root: Path | None = None) -> list[Path]:
|
||||
run the stale-claim reclaimer on all of them in one pass.
|
||||
"""
|
||||
if colonies_root is None:
|
||||
colonies_root = Path.home() / ".hive" / "colonies"
|
||||
from framework.config import COLONIES_DIR
|
||||
|
||||
colonies_root = COLONIES_DIR
|
||||
if not colonies_root.is_dir():
|
||||
return []
|
||||
|
||||
|
||||
@@ -154,11 +154,21 @@ class Worker:
|
||||
# value without affecting the queen's ongoing calls.
|
||||
try:
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
from framework.tasks.scoping import session_task_list_id
|
||||
|
||||
ToolRegistry.set_execution_context(profile=self.id)
|
||||
ctx = self._context
|
||||
agent_id = getattr(ctx, "agent_id", None) or self.id
|
||||
list_id = getattr(ctx, "task_list_id", None) or session_task_list_id(agent_id, self.id)
|
||||
ToolRegistry.set_execution_context(
|
||||
profile=self.id,
|
||||
agent_id=agent_id,
|
||||
task_list_id=list_id,
|
||||
colony_id=getattr(ctx, "colony_id", None),
|
||||
picked_up_from=getattr(ctx, "picked_up_from", None),
|
||||
)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Worker %s: failed to scope browser profile",
|
||||
"Worker %s: failed to scope execution context",
|
||||
self.id,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
@@ -23,6 +23,7 @@ from collections.abc import AsyncIterator, Callable, Iterator
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.config import HIVE_HOME as _HIVE_HOME
|
||||
from framework.llm.provider import LLMProvider, LLMResponse, Tool
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -50,8 +51,8 @@ _ENDPOINTS = [
|
||||
_DEFAULT_PROJECT_ID = "rising-fact-p41fc"
|
||||
_TOKEN_REFRESH_BUFFER_SECS = 60
|
||||
|
||||
# Credentials file in ~/.hive/ (native implementation)
|
||||
_ACCOUNTS_FILE = Path.home() / ".hive" / "antigravity-accounts.json"
|
||||
# Credentials file in $HIVE_HOME (native implementation)
|
||||
_ACCOUNTS_FILE = _HIVE_HOME / "antigravity-accounts.json"
|
||||
_IDE_STATE_DB_MAC = (
|
||||
Path.home() / "Library" / "Application Support" / "Antigravity" / "User" / "globalStorage" / "state.vscdb"
|
||||
)
|
||||
@@ -60,10 +61,12 @@ _IDE_STATE_DB_KEY = "antigravityUnifiedStateSync.oauthToken"
|
||||
|
||||
_BASE_HEADERS: dict[str, str] = {
|
||||
# Mimic the Antigravity Electron app so the API accepts the request.
|
||||
# Google deprecates older client versions over time, so this needs periodic
|
||||
# bumping to match whatever the current Antigravity desktop release advertises.
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Antigravity/1.18.3 Chrome/138.0.7204.235 "
|
||||
"Electron/37.3.1 Safari/537.36"
|
||||
"(KHTML, like Gecko) Antigravity/1.23.2 Chrome/138.0.7204.235 "
|
||||
"Electron/39.2.3 Safari/537.36"
|
||||
),
|
||||
"X-Goog-Api-Client": "google-cloud-sdk vscode_cloudshelleditor/0.1",
|
||||
"Client-Metadata": '{"ideType":"ANTIGRAVITY","platform":"MACOS","pluginType":"GEMINI"}',
|
||||
@@ -253,6 +256,56 @@ def _clean_tool_name(name: str) -> str:
|
||||
return name[:64]
|
||||
|
||||
|
||||
def _sanitize_schema_for_gemini(schema: Any) -> Any:
|
||||
"""Convert JSON Schema 2020-12 features to the OpenAPI 3.0 dialect Gemini accepts.
|
||||
|
||||
Gemini's function_declarations parser rejects union ``"type": ["string", "null"]``.
|
||||
Translate any such union to a single type plus ``"nullable": true``. Recurse into
|
||||
``properties``, ``items``, and the ``anyOf``/``oneOf``/``allOf`` combinators.
|
||||
"""
|
||||
if isinstance(schema, list):
|
||||
return [_sanitize_schema_for_gemini(s) for s in schema]
|
||||
if not isinstance(schema, dict):
|
||||
return schema
|
||||
|
||||
out = dict(schema)
|
||||
t = out.get("type")
|
||||
if isinstance(t, list):
|
||||
non_null = [x for x in t if x != "null"]
|
||||
has_null = "null" in t
|
||||
if len(non_null) == 1:
|
||||
out["type"] = non_null[0]
|
||||
if has_null:
|
||||
out["nullable"] = True
|
||||
elif not non_null and has_null:
|
||||
# Pure null type: fall back to string-nullable.
|
||||
out["type"] = "string"
|
||||
out["nullable"] = True
|
||||
else:
|
||||
# Multi-type non-null unions (e.g. ["string", "integer", "null"])
|
||||
# have no faithful Gemini equivalent. Silently picking one type
|
||||
# changes the contract for callers who rely on the others, so
|
||||
# fail loud and let the schema author rewrite it as anyOf or
|
||||
# narrow to a single type.
|
||||
raise ValueError(
|
||||
f"Unsupported Gemini schema union: {t!r}. "
|
||||
"Gemini accepts a single primitive type plus optional 'nullable: true'. "
|
||||
"Rewrite as anyOf or pick a single type."
|
||||
)
|
||||
|
||||
if "properties" in out and isinstance(out["properties"], dict):
|
||||
out["properties"] = {k: _sanitize_schema_for_gemini(v) for k, v in out["properties"].items()}
|
||||
if "items" in out:
|
||||
out["items"] = _sanitize_schema_for_gemini(out["items"])
|
||||
if "additionalProperties" in out and isinstance(out["additionalProperties"], dict):
|
||||
out["additionalProperties"] = _sanitize_schema_for_gemini(out["additionalProperties"])
|
||||
for combinator in ("anyOf", "oneOf", "allOf"):
|
||||
if combinator in out:
|
||||
out[combinator] = _sanitize_schema_for_gemini(out[combinator])
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _to_gemini_contents(
|
||||
messages: list[dict[str, Any]],
|
||||
thought_sigs: dict[str, str] | None = None,
|
||||
@@ -554,11 +607,13 @@ class AntigravityProvider(LLMProvider):
|
||||
{
|
||||
"name": _clean_tool_name(t.name),
|
||||
"description": t.description,
|
||||
"parameters": t.parameters
|
||||
or {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
},
|
||||
"parameters": _sanitize_schema_for_gemini(
|
||||
t.parameters
|
||||
or {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
}
|
||||
),
|
||||
}
|
||||
for t in tools
|
||||
]
|
||||
@@ -653,10 +708,17 @@ class AntigravityProvider(LLMProvider):
|
||||
system: str = "",
|
||||
tools: list[Tool] | None = None,
|
||||
max_tokens: int = 4096,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> AsyncIterator[StreamEvent]:
|
||||
import asyncio # noqa: PLC0415
|
||||
import concurrent.futures # noqa: PLC0415
|
||||
|
||||
# Antigravity (Google's proprietary endpoint) doesn't expose a
|
||||
# cache_control hook. Concatenate the dynamic suffix so its shape
|
||||
# matches the legacy single-string call site.
|
||||
if system_dynamic_suffix:
|
||||
system = f"{system}\n\n{system_dynamic_suffix}" if system else system_dynamic_suffix
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
queue: asyncio.Queue[StreamEvent | None] = asyncio.Queue()
|
||||
|
||||
|
||||
@@ -1,114 +1,32 @@
|
||||
"""Model capability checks for LLM providers.
|
||||
|
||||
Vision support rules are derived from official vendor documentation:
|
||||
- ZAI (z.ai): docs.z.ai/guides/vlm — GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only
|
||||
- MiniMax: platform.minimax.io/docs — minimax-vl-01 is vision; M2.x are text-only
|
||||
- DeepSeek: api-docs.deepseek.com — deepseek-vl2 is vision; chat/reasoner are text-only
|
||||
- Cerebras: inference-docs.cerebras.ai — no vision models at all
|
||||
- Groq: console.groq.com/docs/vision — vision capable; treat as supported by default
|
||||
- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names
|
||||
don't reliably indicate vision support, so users must configure explicitly
|
||||
Vision support is sourced from the curated ``model_catalog.json``. Each model
|
||||
entry carries an optional ``supports_vision`` boolean; unknown models default
|
||||
to vision-capable so hosted frontier models work out of the box. To toggle
|
||||
support for a model, edit its catalog entry rather than this file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from framework.llm.model_catalog import model_supports_vision
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.llm.provider import Tool
|
||||
|
||||
|
||||
def _model_name(model: str) -> str:
|
||||
"""Return the bare model name after stripping any 'provider/' prefix."""
|
||||
if "/" in model:
|
||||
return model.split("/", 1)[1]
|
||||
return model
|
||||
|
||||
|
||||
# Step 1: explicit vision allow-list — these always support images regardless
|
||||
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
|
||||
# is allowed even though glm-4.6 is denied.
|
||||
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
|
||||
# ZAI/GLM vision models (docs.z.ai/guides/vlm)
|
||||
"glm-4v", # GLM-4V series (legacy)
|
||||
"glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
|
||||
# DeepSeek vision models
|
||||
"deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
|
||||
# MiniMax vision model
|
||||
"minimax-vl", # minimax-vl-01
|
||||
)
|
||||
|
||||
# Step 2: provider-level deny — every model from this provider is text-only.
|
||||
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
|
||||
# Cerebras: inference-docs.cerebras.ai lists only text models
|
||||
"cerebras/",
|
||||
# Local runners: model names don't reliably indicate vision support
|
||||
"ollama/",
|
||||
"ollama_chat/",
|
||||
"lm_studio/",
|
||||
"vllm/",
|
||||
"llamacpp/",
|
||||
)
|
||||
|
||||
# Step 3: per-model deny — text-only models within otherwise mixed providers.
|
||||
# Matched against the bare model name (provider prefix stripped, lower-cased).
|
||||
# The vision allow-list above is checked first, so vision variants of the same
|
||||
# family are already handled before these deny patterns are reached.
|
||||
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
|
||||
# --- ZAI / GLM family ---
|
||||
# text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
|
||||
# vision: glm-4v, glm-4.6v (caught by allow-list above)
|
||||
"glm-5",
|
||||
"glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
|
||||
"glm-4.7",
|
||||
"glm-4.5",
|
||||
"zai-glm",
|
||||
# --- DeepSeek ---
|
||||
# text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
|
||||
# vision: deepseek-vl2 (caught by allow-list above)
|
||||
# Note: LiteLLM's deepseek handler may flatten content lists for some models;
|
||||
# VL models are allowed through and rely on LiteLLM's native VL support.
|
||||
"deepseek-chat",
|
||||
"deepseek-coder",
|
||||
"deepseek-reasoner",
|
||||
# --- MiniMax ---
|
||||
# text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
|
||||
# vision: minimax-vl-01 (caught by allow-list above)
|
||||
"minimax-m2",
|
||||
"minimax-text",
|
||||
"abab",
|
||||
)
|
||||
|
||||
|
||||
def supports_image_tool_results(model: str) -> bool:
|
||||
"""Return whether *model* can receive image content in messages.
|
||||
|
||||
Used to gate both user-message images and tool-result image blocks.
|
||||
|
||||
Logic (checked in order):
|
||||
1. Vision allow-list → True (known vision model, skip all denies)
|
||||
2. Provider deny → False (entire provider is text-only)
|
||||
3. Model deny → False (specific text-only model within a mixed provider)
|
||||
4. Default → True (assume capable; unknown providers and models)
|
||||
Thin wrapper over :func:`model_supports_vision` so existing call sites
|
||||
keep working. Used to gate both user-message images and tool-result
|
||||
image blocks. Empty model strings are treated as capable so the default
|
||||
code path doesn't strip images before a provider is selected.
|
||||
"""
|
||||
model_lower = model.lower()
|
||||
bare = _model_name(model_lower)
|
||||
|
||||
# 1. Explicit vision allow — takes priority over all denies
|
||||
if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES):
|
||||
if not model:
|
||||
return True
|
||||
|
||||
# 2. Provider-level deny (all models from this provider are text-only)
|
||||
if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES):
|
||||
return False
|
||||
|
||||
# 3. Per-model deny (text-only variants within mixed-capability families)
|
||||
if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
|
||||
return False
|
||||
|
||||
# 5. Default: assume vision capable
|
||||
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
|
||||
return True
|
||||
return model_supports_vision(model)
|
||||
|
||||
|
||||
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
|
||||
|
||||
+454
-41
@@ -33,6 +33,7 @@ except ImportError:
|
||||
RateLimitError = Exception # type: ignore[assignment, misc]
|
||||
|
||||
from framework.config import HIVE_LLM_ENDPOINT as HIVE_API_BASE
|
||||
from framework.llm.model_catalog import get_model_pricing
|
||||
from framework.llm.provider import LLMProvider, LLMResponse, Tool
|
||||
from framework.llm.stream_events import StreamEvent
|
||||
|
||||
@@ -43,6 +44,30 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _api_base_needs_bearer_auth(api_base: str | None) -> bool:
|
||||
"""Return True when api_base points at an Anthropic-compatible endpoint
|
||||
that authenticates via ``Authorization: Bearer`` rather than ``x-api-key``.
|
||||
|
||||
The Hive LLM proxy (Rust service in hive-backend/llm/) speaks the
|
||||
Anthropic Messages API but mints user-scoped JWTs and validates them
|
||||
via Bearer auth. Default upstream Anthropic endpoints (api.anthropic.com,
|
||||
Kimi's api.kimi.com/coding) keep using x-api-key, so the override is
|
||||
scoped to known hive-proxy hosts plus the env-configured override.
|
||||
"""
|
||||
if not api_base:
|
||||
return False
|
||||
# Strip protocol, port, and path so a plain hostname compare is enough
|
||||
# for the common cases.
|
||||
lowered = api_base.lower()
|
||||
for host in ("adenhq.com", "open-hive.com", "127.0.0.1:8890", "localhost:8890"):
|
||||
if host in lowered:
|
||||
return True
|
||||
override = os.environ.get("HIVE_LLM_BASE_URL")
|
||||
if override and override.lower() in lowered:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _patch_litellm_anthropic_oauth() -> None:
|
||||
"""Patch litellm's Anthropic header construction to fix OAuth token handling.
|
||||
|
||||
@@ -186,6 +211,44 @@ def _ensure_ollama_chat_prefix(model: str) -> str:
|
||||
return model
|
||||
|
||||
|
||||
def rewrite_proxy_model(
|
||||
model: str, api_key: str | None, api_base: str | None
|
||||
) -> tuple[str, str | None, dict[str, str]]:
|
||||
"""Apply Hive/Kimi proxy rewrites for any caller of ``litellm.acompletion``.
|
||||
|
||||
Both the Hive LLM proxy and Kimi For Coding expose Anthropic-API-
|
||||
compatible endpoints. LiteLLM doesn't recognise the ``hive/`` or
|
||||
``kimi/`` prefixes natively, so we rewrite them to ``anthropic/``
|
||||
here. For the Hive proxy we also stamp a Bearer token into
|
||||
``extra_headers`` because litellm's Anthropic handler only sends
|
||||
``x-api-key`` and the proxy expects ``Authorization: Bearer``.
|
||||
|
||||
Used by ad-hoc ``litellm.acompletion`` callers (e.g. the vision-
|
||||
fallback subagent in ``caption_tool_image``) so they hit the same
|
||||
proxy with the same auth as the main agent's ``LiteLLMProvider``.
|
||||
The provider's own ``__init__`` keeps its inlined rewrite for now —
|
||||
this helper is the single source of truth for ad-hoc callers.
|
||||
|
||||
Returns: (rewritten_model, normalised_api_base, extra_headers).
|
||||
The ``extra_headers`` dict is non-empty only for the Hive proxy
|
||||
(and only when ``api_key`` is provided).
|
||||
"""
|
||||
extra_headers: dict[str, str] = {}
|
||||
if model.lower().startswith("kimi/"):
|
||||
model = "anthropic/" + model[len("kimi/") :]
|
||||
if api_base and api_base.rstrip("/").endswith("/v1"):
|
||||
api_base = api_base.rstrip("/")[:-3]
|
||||
elif model.lower().startswith("hive/"):
|
||||
model = "anthropic/" + model[len("hive/") :]
|
||||
if api_base and api_base.rstrip("/").endswith("/v1"):
|
||||
api_base = api_base.rstrip("/")[:-3]
|
||||
# Hive proxy expects Bearer auth; litellm's Anthropic handler
|
||||
# only sends x-api-key without this nudge.
|
||||
if api_key:
|
||||
extra_headers["Authorization"] = f"Bearer {api_key}"
|
||||
return model, api_base, extra_headers
|
||||
|
||||
|
||||
RATE_LIMIT_MAX_RETRIES = 10
|
||||
RATE_LIMIT_BACKOFF_BASE = 2 # seconds
|
||||
RATE_LIMIT_MAX_DELAY = 120 # seconds - cap to prevent absurd waits
|
||||
@@ -213,9 +276,72 @@ _CACHE_CONTROL_PREFIXES = (
|
||||
"glm-",
|
||||
)
|
||||
|
||||
# OpenRouter sub-provider prefixes whose upstream API honors `cache_control`.
|
||||
# OpenRouter passes the marker through to the underlying provider for these.
|
||||
# (See https://openrouter.ai/docs/guides/best-practices/prompt-caching.)
|
||||
# OpenAI/DeepSeek/Groq/Grok/Moonshot route through OpenRouter but cache
|
||||
# automatically server-side — sending cache_control there is a no-op, not a
|
||||
# win, and they need a separate prefix-stability fix to actually get hits.
|
||||
_OPENROUTER_CACHE_CONTROL_PREFIXES = (
|
||||
"openrouter/anthropic/",
|
||||
"openrouter/google/gemini-",
|
||||
"openrouter/z-ai/glm",
|
||||
"openrouter/minimax/",
|
||||
)
|
||||
|
||||
|
||||
def _model_supports_cache_control(model: str) -> bool:
|
||||
return any(model.startswith(p) for p in _CACHE_CONTROL_PREFIXES)
|
||||
if any(model.startswith(p) for p in _CACHE_CONTROL_PREFIXES):
|
||||
return True
|
||||
return any(model.startswith(p) for p in _OPENROUTER_CACHE_CONTROL_PREFIXES)
|
||||
|
||||
|
||||
def _build_system_message(
|
||||
system: str,
|
||||
system_dynamic_suffix: str | None,
|
||||
model: str,
|
||||
) -> dict[str, Any] | None:
|
||||
"""Construct the system-role message for the chat completion.
|
||||
|
||||
Returns ``None`` when there is nothing to send.
|
||||
|
||||
Two-block split path — used when the caller supplied a non-empty
|
||||
``system_dynamic_suffix`` AND the provider honors ``cache_control``
|
||||
(Anthropic, MiniMax, Z-AI/GLM). We emit ``content`` as a list of two
|
||||
text blocks with an ephemeral ``cache_control`` marker on the first
|
||||
block only. The prompt cache keeps the static prefix warm across
|
||||
turns and across iterations within a turn; only the small dynamic
|
||||
tail is recomputed on every request.
|
||||
|
||||
Single-string path — used for every other case (no suffix provided,
|
||||
or provider doesn't honor ``cache_control``). We concatenate
|
||||
``system`` + ``\\n\\n`` + ``system_dynamic_suffix`` and attach
|
||||
``cache_control`` to the whole message when the provider supports
|
||||
it. This is byte-identical to the pre-split behavior for all
|
||||
non-cache-control providers (OpenAI, Gemini, Groq, Ollama, etc.).
|
||||
"""
|
||||
if not system and not system_dynamic_suffix:
|
||||
return None
|
||||
if system_dynamic_suffix and _model_supports_cache_control(model):
|
||||
content_blocks: list[dict[str, Any]] = []
|
||||
if system:
|
||||
content_blocks.append(
|
||||
{
|
||||
"type": "text",
|
||||
"text": system,
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
}
|
||||
)
|
||||
content_blocks.append({"type": "text", "text": system_dynamic_suffix})
|
||||
return {"role": "system", "content": content_blocks}
|
||||
# Single-string path (legacy or no-cache-control provider).
|
||||
combined = system
|
||||
if system_dynamic_suffix:
|
||||
combined = f"{system}\n\n{system_dynamic_suffix}" if system else system_dynamic_suffix
|
||||
sys_msg: dict[str, Any] = {"role": "system", "content": combined}
|
||||
if _model_supports_cache_control(model):
|
||||
sys_msg["cache_control"] = {"type": "ephemeral"}
|
||||
return sys_msg
|
||||
|
||||
|
||||
# Kimi For Coding uses an Anthropic-compatible endpoint (no /v1 suffix).
|
||||
@@ -289,14 +415,186 @@ OPENROUTER_TOOL_COMPAT_MODEL_CACHE: dict[str, float] = {}
|
||||
# from rate-limit retries — 3 retries is sufficient for connection failures.
|
||||
STREAM_TRANSIENT_MAX_RETRIES = 3
|
||||
|
||||
# Directory for dumping failed requests
|
||||
FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"
|
||||
|
||||
# Maximum number of dump files to retain in ~/.hive/failed_requests/.
|
||||
# Directory for dumping failed requests. Resolved lazily so HIVE_HOME
|
||||
# overrides (set by the desktop shell) take effect even if this module
|
||||
# is imported before framework.config picks up the override.
|
||||
def _failed_requests_dir() -> Path:
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
return HIVE_HOME / "failed_requests"
|
||||
|
||||
|
||||
# Maximum number of dump files to retain in $HIVE_HOME/failed_requests/.
|
||||
# Older files are pruned automatically to prevent unbounded disk growth.
|
||||
MAX_FAILED_REQUEST_DUMPS = 50
|
||||
|
||||
|
||||
def _cost_from_catalog_pricing(
|
||||
model: str,
|
||||
input_tokens: int,
|
||||
output_tokens: int,
|
||||
cached_tokens: int = 0,
|
||||
cache_creation_tokens: int = 0,
|
||||
) -> float:
|
||||
"""Last-resort cost calculation using curated catalog pricing.
|
||||
|
||||
Consulted only when the provider response carries no native cost and
|
||||
LiteLLM's own catalog has no pricing for ``model``. Reads
|
||||
``pricing_usd_per_mtok`` from ``model_catalog.json``. Rates are USD per
|
||||
million tokens.
|
||||
|
||||
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
|
||||
``input_tokens`` (see ``_extract_cache_tokens``), so subtract them from
|
||||
the base input count to avoid double-billing. If a cache rate is absent,
|
||||
fall back to the plain input rate.
|
||||
"""
|
||||
if not model or (input_tokens == 0 and output_tokens == 0):
|
||||
return 0.0
|
||||
pricing = get_model_pricing(model)
|
||||
if pricing is None and "/" in model:
|
||||
# LiteLLM prefixes some ids (e.g. "openrouter/z-ai/glm-5.1"); the
|
||||
# catalog stores the bare form ("z-ai/glm-5.1"). Strip one segment.
|
||||
pricing = get_model_pricing(model.split("/", 1)[1])
|
||||
if pricing is None:
|
||||
return 0.0
|
||||
|
||||
per_mtok_in = pricing.get("input", 0.0)
|
||||
per_mtok_out = pricing.get("output", 0.0)
|
||||
per_mtok_cache_read = pricing.get("cache_read", per_mtok_in)
|
||||
per_mtok_cache_write = pricing.get("cache_creation", per_mtok_in)
|
||||
|
||||
plain_input = max(input_tokens - cached_tokens - cache_creation_tokens, 0)
|
||||
total = (
|
||||
plain_input * per_mtok_in
|
||||
+ cached_tokens * per_mtok_cache_read
|
||||
+ cache_creation_tokens * per_mtok_cache_write
|
||||
+ output_tokens * per_mtok_out
|
||||
) / 1_000_000
|
||||
return float(total) if total > 0 else 0.0
|
||||
|
||||
|
||||
def _extract_cost(response: Any, model: str) -> float:
|
||||
"""Pull the USD cost for a non-streaming completion response.
|
||||
|
||||
Sources checked, in priority order:
|
||||
1. ``usage.cost`` — populated when OpenRouter returns native cost via
|
||||
``usage: {include: true}`` or when ``litellm.include_cost_in_streaming_usage``
|
||||
is on.
|
||||
2. ``response._hidden_params["response_cost"]`` — set by LiteLLM's
|
||||
logging layer after most successful completions.
|
||||
3. ``litellm.completion_cost(...)`` — computes from the model pricing
|
||||
table; works across Anthropic, OpenAI, and OpenRouter as long as the
|
||||
model is in LiteLLM's catalog.
|
||||
4. ``pricing_usd_per_mtok`` from the curated model catalog — covers
|
||||
models (e.g. GLM, Kimi, MiniMax) that LiteLLM doesn't price.
|
||||
|
||||
Returns 0.0 for unpriced models or unexpected response shapes — cost is a
|
||||
display concern, never let it break the hot path. For streaming paths
|
||||
where the aggregate response isn't a full ``ModelResponse``, use
|
||||
:func:`_cost_from_tokens` with the already-extracted token counts.
|
||||
"""
|
||||
if response is None:
|
||||
return 0.0
|
||||
usage = getattr(response, "usage", None)
|
||||
usage_cost = getattr(usage, "cost", None) if usage is not None else None
|
||||
if isinstance(usage_cost, (int, float)) and usage_cost > 0:
|
||||
return float(usage_cost)
|
||||
|
||||
hidden = getattr(response, "_hidden_params", None)
|
||||
if isinstance(hidden, dict):
|
||||
hp_cost = hidden.get("response_cost")
|
||||
if isinstance(hp_cost, (int, float)) and hp_cost > 0:
|
||||
return float(hp_cost)
|
||||
|
||||
try:
|
||||
import litellm as _litellm
|
||||
|
||||
computed = _litellm.completion_cost(completion_response=response, model=model)
|
||||
if isinstance(computed, (int, float)) and computed > 0:
|
||||
return float(computed)
|
||||
except Exception as exc:
|
||||
logger.debug("[cost] completion_cost failed for %s: %s", model, exc)
|
||||
|
||||
if usage is not None:
|
||||
input_tokens = int(getattr(usage, "prompt_tokens", 0) or 0)
|
||||
output_tokens = int(getattr(usage, "completion_tokens", 0) or 0)
|
||||
cache_read, cache_creation = _extract_cache_tokens(usage)
|
||||
fallback = _cost_from_catalog_pricing(model, input_tokens, output_tokens, cache_read, cache_creation)
|
||||
if fallback > 0:
|
||||
return fallback
|
||||
return 0.0
|
||||
|
||||
|
||||
def _cost_from_tokens(
|
||||
model: str,
|
||||
input_tokens: int,
|
||||
output_tokens: int,
|
||||
cached_tokens: int = 0,
|
||||
cache_creation_tokens: int = 0,
|
||||
) -> float:
|
||||
"""Compute USD cost from already-normalized token counts.
|
||||
|
||||
Used on streaming paths where the aggregate ``response`` is the stream
|
||||
wrapper (not a full ``ModelResponse``) and ``litellm.completion_cost`` on
|
||||
it either no-ops or raises. Calls ``litellm.cost_per_token`` directly
|
||||
with the cache-aware inputs so Anthropic's 5-min-write / cache-read
|
||||
multipliers are applied correctly.
|
||||
"""
|
||||
if not model or (input_tokens == 0 and output_tokens == 0):
|
||||
return 0.0
|
||||
try:
|
||||
import litellm as _litellm
|
||||
|
||||
prompt_cost, completion_cost = _litellm.cost_per_token(
|
||||
model=model,
|
||||
prompt_tokens=input_tokens,
|
||||
completion_tokens=output_tokens,
|
||||
cache_read_input_tokens=cached_tokens,
|
||||
cache_creation_input_tokens=cache_creation_tokens,
|
||||
)
|
||||
total = (prompt_cost or 0.0) + (completion_cost or 0.0)
|
||||
if total > 0:
|
||||
return float(total)
|
||||
except Exception as exc:
|
||||
logger.debug("[cost] cost_per_token failed for %s: %s", model, exc)
|
||||
return _cost_from_catalog_pricing(model, input_tokens, output_tokens, cached_tokens, cache_creation_tokens)
|
||||
|
||||
|
||||
def _extract_cache_tokens(usage: Any) -> tuple[int, int]:
|
||||
"""Pull (cache_read, cache_creation) from a LiteLLM usage object.
|
||||
|
||||
Both are subsets of ``prompt_tokens`` already — providers count them
|
||||
inside the input total. Surface separately for visibility, never sum.
|
||||
|
||||
Field names vary by provider/proxy; check the known shapes in priority
|
||||
order and fall back to 0:
|
||||
|
||||
cache_read:
|
||||
- ``prompt_tokens_details.cached_tokens`` — OpenAI-shape; also what
|
||||
LiteLLM normalizes Anthropic and OpenRouter into.
|
||||
- ``cache_read_input_tokens`` — raw Anthropic field name.
|
||||
|
||||
cache_creation:
|
||||
- ``prompt_tokens_details.cache_write_tokens`` — OpenRouter's
|
||||
normalized field for cache writes (verified empirically against
|
||||
``openrouter/anthropic/*`` and ``openrouter/z-ai/*`` responses).
|
||||
- ``cache_creation_input_tokens`` — raw Anthropic top-level field.
|
||||
"""
|
||||
if not usage:
|
||||
return 0, 0
|
||||
_details = getattr(usage, "prompt_tokens_details", None)
|
||||
cache_read = (
|
||||
getattr(_details, "cached_tokens", 0) or 0
|
||||
if _details is not None
|
||||
else getattr(usage, "cache_read_input_tokens", 0) or 0
|
||||
)
|
||||
cache_creation = (getattr(_details, "cache_write_tokens", 0) or 0 if _details is not None else 0) or (
|
||||
getattr(usage, "cache_creation_input_tokens", 0) or 0
|
||||
)
|
||||
return cache_read, cache_creation
|
||||
|
||||
|
||||
def _estimate_tokens(model: str, messages: list[dict]) -> tuple[int, str]:
|
||||
"""Estimate token count for messages. Returns (token_count, method)."""
|
||||
# Try litellm's token counter first
|
||||
@@ -319,7 +617,7 @@ def _prune_failed_request_dumps(max_files: int = MAX_FAILED_REQUEST_DUMPS) -> No
|
||||
"""
|
||||
try:
|
||||
all_dumps = sorted(
|
||||
FAILED_REQUESTS_DIR.glob("*.json"),
|
||||
_failed_requests_dir().glob("*.json"),
|
||||
key=lambda f: f.stat().st_mtime,
|
||||
)
|
||||
excess = len(all_dumps) - max_files
|
||||
@@ -354,11 +652,12 @@ def _dump_failed_request(
|
||||
) -> str:
|
||||
"""Dump failed request to a file for debugging. Returns the file path."""
|
||||
try:
|
||||
FAILED_REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
dump_dir = _failed_requests_dir()
|
||||
dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
||||
filename = f"{error_type}_{model.replace('/', '_')}_{timestamp}.json"
|
||||
filepath = FAILED_REQUESTS_DIR / filename
|
||||
filepath = dump_dir / filename
|
||||
|
||||
# Build dump data
|
||||
messages = kwargs.get("messages", [])
|
||||
@@ -388,7 +687,7 @@ def _dump_failed_request(
|
||||
|
||||
return str(filepath)
|
||||
except OSError as e:
|
||||
logger.warning(f"Failed to dump request debug log to {FAILED_REQUESTS_DIR}: {e}")
|
||||
logger.warning(f"Failed to dump request debug log to {_failed_requests_dir()}: {e}")
|
||||
return "log_write_failed"
|
||||
|
||||
|
||||
@@ -702,6 +1001,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Translate kimi/ prefix to anthropic/ so litellm uses the Anthropic
|
||||
# Messages API handler and routes to that endpoint — no special headers needed.
|
||||
_original_model = model
|
||||
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
|
||||
if _is_ollama_model(model):
|
||||
model = _ensure_ollama_chat_prefix(model)
|
||||
elif model.lower().startswith("kimi/"):
|
||||
@@ -755,6 +1055,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
these attributes in-place propagates to all callers on the next LLM call.
|
||||
"""
|
||||
_original_model = model
|
||||
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
|
||||
if _is_ollama_model(model):
|
||||
model = _ensure_ollama_chat_prefix(model)
|
||||
elif model.lower().startswith("kimi/"):
|
||||
@@ -994,6 +1295,16 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Ollama requires explicit tool_choice=auto for function calling
|
||||
# so future readers don't have to guess.
|
||||
kwargs.setdefault("tool_choice", "auto")
|
||||
elif self._hive_proxy_auth:
|
||||
# The Hive LLM proxy fronts GLM, which drifts into "explain
|
||||
# the plan" mode on long-context turns instead of emitting
|
||||
# tool_use blocks (verified 2026-04-28: tool_choice=null →
|
||||
# text-only stop=stop; tool_choice=required → clean
|
||||
# tool_use). Force a tool call when tools are available
|
||||
# so queens can't get stuck in chat mode. Callers that
|
||||
# legitimately want a non-tool turn can override via
|
||||
# extra_kwargs.
|
||||
kwargs.setdefault("tool_choice", "required")
|
||||
|
||||
# Add response_format for structured output
|
||||
# LiteLLM passes this through to the underlying provider
|
||||
@@ -1015,12 +1326,17 @@ class LiteLLMProvider(LLMProvider):
|
||||
usage = response.usage
|
||||
input_tokens = usage.prompt_tokens if usage else 0
|
||||
output_tokens = usage.completion_tokens if usage else 0
|
||||
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
|
||||
cost_usd = _extract_cost(response, self.model)
|
||||
|
||||
return LLMResponse(
|
||||
content=content,
|
||||
model=response.model or self.model,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
stop_reason=response.choices[0].finish_reason or "",
|
||||
raw_response=response,
|
||||
)
|
||||
@@ -1169,8 +1485,16 @@ class LiteLLMProvider(LLMProvider):
|
||||
response_format: dict[str, Any] | None = None,
|
||||
json_mode: bool = False,
|
||||
max_retries: int | None = None,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> LLMResponse:
|
||||
"""Async version of complete(). Uses litellm.acompletion — non-blocking."""
|
||||
"""Async version of complete(). Uses litellm.acompletion — non-blocking.
|
||||
|
||||
``system_dynamic_suffix`` is an optional per-turn tail. When set and
|
||||
the provider honors ``cache_control``, ``system`` is sent as the
|
||||
cached prefix and the suffix trails as an uncached second content
|
||||
block. Otherwise the two strings are concatenated into a single
|
||||
system message (legacy behavior).
|
||||
"""
|
||||
# Codex ChatGPT backend requires streaming — route through stream() which
|
||||
# already handles Codex quirks and has proper tool call accumulation.
|
||||
if self._codex_backend:
|
||||
@@ -1181,6 +1505,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
max_tokens=max_tokens,
|
||||
response_format=response_format,
|
||||
json_mode=json_mode,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
)
|
||||
return await self._collect_stream_to_response(stream_iter)
|
||||
|
||||
@@ -1188,10 +1513,8 @@ class LiteLLMProvider(LLMProvider):
|
||||
if self._claude_code_oauth:
|
||||
billing = _claude_code_billing_header(messages)
|
||||
full_messages.append({"role": "system", "content": billing})
|
||||
if system:
|
||||
sys_msg: dict[str, Any] = {"role": "system", "content": system}
|
||||
if _model_supports_cache_control(self.model):
|
||||
sys_msg["cache_control"] = {"type": "ephemeral"}
|
||||
sys_msg = _build_system_message(system, system_dynamic_suffix, self.model)
|
||||
if sys_msg is not None:
|
||||
full_messages.append(sys_msg)
|
||||
full_messages.extend(messages)
|
||||
|
||||
@@ -1219,6 +1542,10 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Ollama requires explicit tool_choice=auto for function calling
|
||||
# so future readers don't have to guess.
|
||||
kwargs.setdefault("tool_choice", "auto")
|
||||
elif self._hive_proxy_auth:
|
||||
# See `complete()` for the rationale: GLM behind the Hive
|
||||
# proxy needs forcing or it goes chat-mode on long contexts.
|
||||
kwargs.setdefault("tool_choice", "required")
|
||||
if response_format:
|
||||
kwargs["response_format"] = response_format
|
||||
|
||||
@@ -1228,12 +1555,17 @@ class LiteLLMProvider(LLMProvider):
|
||||
usage = response.usage
|
||||
input_tokens = usage.prompt_tokens if usage else 0
|
||||
output_tokens = usage.completion_tokens if usage else 0
|
||||
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
|
||||
cost_usd = _extract_cost(response, self.model)
|
||||
|
||||
return LLMResponse(
|
||||
content=content,
|
||||
model=response.model or self.model,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
stop_reason=response.choices[0].finish_reason or "",
|
||||
raw_response=response,
|
||||
)
|
||||
@@ -1619,6 +1951,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
messages: list[dict[str, Any]],
|
||||
system: str,
|
||||
tools: list[Tool],
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build a JSON-only prompt for models without native tool support."""
|
||||
tool_specs = [
|
||||
@@ -1646,7 +1979,19 @@ class LiteLLMProvider(LLMProvider):
|
||||
)
|
||||
compat_system = compat_instruction if not system else f"{system}\n\n{compat_instruction}"
|
||||
|
||||
full_messages: list[dict[str, Any]] = [{"role": "system", "content": compat_system}]
|
||||
# If the routed sub-provider honors cache_control (e.g.
|
||||
# openrouter/anthropic/*), split the static prefix from the dynamic
|
||||
# suffix so the prefix stays cache-warm across turns. Otherwise fall
|
||||
# back to a single concatenated string.
|
||||
system_message = _build_system_message(
|
||||
compat_system,
|
||||
system_dynamic_suffix,
|
||||
self.model,
|
||||
)
|
||||
|
||||
full_messages: list[dict[str, Any]] = []
|
||||
if system_message is not None:
|
||||
full_messages.append(system_message)
|
||||
full_messages.extend(messages)
|
||||
return [
|
||||
message
|
||||
@@ -1660,9 +2005,21 @@ class LiteLLMProvider(LLMProvider):
|
||||
system: str,
|
||||
tools: list[Tool],
|
||||
max_tokens: int,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> LLMResponse:
|
||||
"""Emulate tool calling via JSON when OpenRouter rejects native tools."""
|
||||
full_messages = self._build_openrouter_tool_compat_messages(messages, system, tools)
|
||||
"""Emulate tool calling via JSON when OpenRouter rejects native tools.
|
||||
|
||||
When the routed sub-provider honors ``cache_control`` (e.g.
|
||||
``openrouter/anthropic/*``), the message builder splits the static
|
||||
prefix from the dynamic suffix so the prefix stays cache-warm.
|
||||
Otherwise the suffix is concatenated into a single system string.
|
||||
"""
|
||||
full_messages = self._build_openrouter_tool_compat_messages(
|
||||
messages,
|
||||
system,
|
||||
tools,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
)
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": self.model,
|
||||
"messages": full_messages,
|
||||
@@ -1683,6 +2040,8 @@ class LiteLLMProvider(LLMProvider):
|
||||
usage = response.usage
|
||||
input_tokens = usage.prompt_tokens if usage else 0
|
||||
output_tokens = usage.completion_tokens if usage else 0
|
||||
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
|
||||
cost_usd = _extract_cost(response, self.model)
|
||||
stop_reason = "tool_calls" if tool_calls else (response.choices[0].finish_reason or "stop")
|
||||
|
||||
return LLMResponse(
|
||||
@@ -1690,6 +2049,9 @@ class LiteLLMProvider(LLMProvider):
|
||||
model=response.model or self.model,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
stop_reason=stop_reason,
|
||||
raw_response={
|
||||
"compat_mode": "openrouter_tool_emulation",
|
||||
@@ -1704,6 +2066,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
system: str,
|
||||
tools: list[Tool],
|
||||
max_tokens: int,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> AsyncIterator[StreamEvent]:
|
||||
"""Fallback stream for OpenRouter models without native tool support."""
|
||||
from framework.llm.stream_events import (
|
||||
@@ -1724,6 +2087,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
system=system,
|
||||
tools=tools,
|
||||
max_tokens=max_tokens,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
)
|
||||
except Exception as e:
|
||||
yield StreamErrorEvent(error=str(e), recoverable=False)
|
||||
@@ -1747,6 +2111,9 @@ class LiteLLMProvider(LLMProvider):
|
||||
stop_reason=response.stop_reason,
|
||||
input_tokens=response.input_tokens,
|
||||
output_tokens=response.output_tokens,
|
||||
cached_tokens=response.cached_tokens,
|
||||
cache_creation_tokens=response.cache_creation_tokens,
|
||||
cost_usd=response.cost_usd,
|
||||
model=response.model,
|
||||
)
|
||||
|
||||
@@ -1758,6 +2125,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
max_tokens: int,
|
||||
response_format: dict[str, Any] | None,
|
||||
json_mode: bool,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> AsyncIterator[StreamEvent]:
|
||||
"""Fallback path: convert non-stream completion to stream events.
|
||||
|
||||
@@ -1781,6 +2149,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
max_tokens=max_tokens,
|
||||
response_format=response_format,
|
||||
json_mode=json_mode,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
)
|
||||
except Exception as e:
|
||||
yield StreamErrorEvent(error=str(e), recoverable=False)
|
||||
@@ -1812,6 +2181,9 @@ class LiteLLMProvider(LLMProvider):
|
||||
stop_reason=response.stop_reason or "stop",
|
||||
input_tokens=response.input_tokens,
|
||||
output_tokens=response.output_tokens,
|
||||
cached_tokens=response.cached_tokens,
|
||||
cache_creation_tokens=response.cache_creation_tokens,
|
||||
cost_usd=response.cost_usd,
|
||||
model=response.model,
|
||||
)
|
||||
|
||||
@@ -1823,6 +2195,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
max_tokens: int = 4096,
|
||||
response_format: dict[str, Any] | None = None,
|
||||
json_mode: bool = False,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> AsyncIterator[StreamEvent]:
|
||||
"""Stream a completion via litellm.acompletion(stream=True).
|
||||
|
||||
@@ -1833,6 +2206,9 @@ class LiteLLMProvider(LLMProvider):
|
||||
Empty responses (e.g. Gemini stealth rate-limits that return 200
|
||||
with no content) are retried with exponential backoff, mirroring
|
||||
the retry behaviour of ``_completion_with_rate_limit_retry``.
|
||||
|
||||
``system_dynamic_suffix`` is an optional per-turn tail. See
|
||||
``acomplete`` docstring for the two-block split semantics.
|
||||
"""
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -1852,6 +2228,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
max_tokens=max_tokens,
|
||||
response_format=response_format,
|
||||
json_mode=json_mode,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
):
|
||||
yield event
|
||||
return
|
||||
@@ -1862,6 +2239,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
system=system,
|
||||
tools=tools,
|
||||
max_tokens=max_tokens,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
):
|
||||
yield event
|
||||
return
|
||||
@@ -1870,19 +2248,18 @@ class LiteLLMProvider(LLMProvider):
|
||||
if self._claude_code_oauth:
|
||||
billing = _claude_code_billing_header(messages)
|
||||
full_messages.append({"role": "system", "content": billing})
|
||||
if system:
|
||||
sys_msg: dict[str, Any] = {"role": "system", "content": system}
|
||||
if _model_supports_cache_control(self.model):
|
||||
sys_msg["cache_control"] = {"type": "ephemeral"}
|
||||
sys_msg = _build_system_message(system, system_dynamic_suffix, self.model)
|
||||
if sys_msg is not None:
|
||||
full_messages.append(sys_msg)
|
||||
full_messages.extend(messages)
|
||||
|
||||
if logger.isEnabledFor(logging.DEBUG) and full_messages:
|
||||
import json as _json
|
||||
from datetime import datetime as _dt
|
||||
from pathlib import Path as _Path
|
||||
|
||||
_debug_dir = _Path.home() / ".hive" / "debug_logs"
|
||||
from framework.config import HIVE_HOME as _HIVE_HOME
|
||||
|
||||
_debug_dir = _HIVE_HOME / "debug_logs"
|
||||
_debug_dir.mkdir(parents=True, exist_ok=True)
|
||||
_ts = _dt.now().strftime("%Y%m%d_%H%M%S_%f")
|
||||
_dump_file = _debug_dir / f"llm_request_{_ts}.json"
|
||||
@@ -1953,18 +2330,22 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Ollama requires explicit tool_choice=auto for function calling
|
||||
# so future readers don't have to guess.
|
||||
kwargs.setdefault("tool_choice", "auto")
|
||||
elif self._hive_proxy_auth:
|
||||
# See `complete()` for the rationale: GLM behind the Hive
|
||||
# proxy needs forcing or it goes chat-mode on long contexts.
|
||||
kwargs.setdefault("tool_choice", "required")
|
||||
if response_format:
|
||||
kwargs["response_format"] = response_format
|
||||
# The Codex ChatGPT backend (Responses API) rejects several params.
|
||||
if self._codex_backend:
|
||||
kwargs.pop("max_tokens", None)
|
||||
kwargs.pop("stream_options", None)
|
||||
# Pass store directly to OpenAI in case litellm drops it as unknown
|
||||
if "extra_body" not in kwargs:
|
||||
kwargs["extra_body"] = {}
|
||||
kwargs["extra_body"]["store"] = False
|
||||
|
||||
request_summary = _summarize_request_for_log(kwargs)
|
||||
logger.debug(
|
||||
"[stream] prepared request: %s",
|
||||
json.dumps(request_summary, default=str),
|
||||
)
|
||||
if request_summary["system_only"]:
|
||||
logger.warning(
|
||||
"[stream] %s request has no non-system chat messages "
|
||||
@@ -2105,37 +2486,44 @@ class LiteLLMProvider(LLMProvider):
|
||||
type(usage).__name__,
|
||||
)
|
||||
cached_tokens = 0
|
||||
cache_creation_tokens = 0
|
||||
if usage:
|
||||
input_tokens = getattr(usage, "prompt_tokens", 0) or 0
|
||||
output_tokens = getattr(usage, "completion_tokens", 0) or 0
|
||||
_details = getattr(usage, "prompt_tokens_details", None)
|
||||
cached_tokens = (
|
||||
getattr(_details, "cached_tokens", 0) or 0
|
||||
if _details is not None
|
||||
else getattr(usage, "cache_read_input_tokens", 0) or 0
|
||||
)
|
||||
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
|
||||
logger.debug(
|
||||
"[tokens] finish-chunk usage: input=%d output=%d cached=%d model=%s",
|
||||
"[tokens] finish-chunk usage: input=%d output=%d cached=%d cache_creation=%d model=%s",
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
cached_tokens,
|
||||
cache_creation_tokens,
|
||||
self.model,
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
"[tokens] finish event: input=%d output=%d cached=%d stop=%s model=%s",
|
||||
"[tokens] finish event: input=%d output=%d cached=%d cache_creation=%d stop=%s model=%s",
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
cached_tokens,
|
||||
cache_creation_tokens,
|
||||
choice.finish_reason,
|
||||
self.model,
|
||||
)
|
||||
cost_usd = _cost_from_tokens(
|
||||
self.model,
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
cached_tokens,
|
||||
cache_creation_tokens,
|
||||
)
|
||||
tail_events.append(
|
||||
FinishEvent(
|
||||
stop_reason=choice.finish_reason,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
model=self.model,
|
||||
)
|
||||
)
|
||||
@@ -2155,19 +2543,36 @@ class LiteLLMProvider(LLMProvider):
|
||||
_usage = calculate_total_usage(chunks=_chunks)
|
||||
input_tokens = _usage.prompt_tokens or 0
|
||||
output_tokens = _usage.completion_tokens or 0
|
||||
_details = getattr(_usage, "prompt_tokens_details", None)
|
||||
cached_tokens = (
|
||||
getattr(_details, "cached_tokens", 0) or 0
|
||||
if _details is not None
|
||||
else getattr(_usage, "cache_read_input_tokens", 0) or 0
|
||||
)
|
||||
# `calculate_total_usage` aggregates token totals
|
||||
# but discards `prompt_tokens_details` — which is
|
||||
# where OpenRouter puts `cached_tokens` and
|
||||
# `cache_write_tokens`. Recover them directly
|
||||
# from the most recent chunk that carries usage.
|
||||
cached_tokens, cache_creation_tokens = 0, 0
|
||||
for _raw in reversed(_chunks):
|
||||
_raw_usage = getattr(_raw, "usage", None)
|
||||
if _raw_usage is None:
|
||||
continue
|
||||
_cr, _cc = _extract_cache_tokens(_raw_usage)
|
||||
if _cr or _cc:
|
||||
cached_tokens, cache_creation_tokens = _cr, _cc
|
||||
break
|
||||
logger.debug(
|
||||
"[tokens] post-loop chunks fallback: input=%d output=%d cached=%d model=%s",
|
||||
"[tokens] post-loop chunks fallback: input=%d output=%d "
|
||||
"cached=%d cache_creation=%d model=%s",
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
cached_tokens,
|
||||
cache_creation_tokens,
|
||||
self.model,
|
||||
)
|
||||
cost_usd = _cost_from_tokens(
|
||||
self.model,
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
cached_tokens,
|
||||
cache_creation_tokens,
|
||||
)
|
||||
# Patch the FinishEvent already queued with 0 tokens
|
||||
for _i, _ev in enumerate(tail_events):
|
||||
if isinstance(_ev, FinishEvent) and _ev.input_tokens == 0:
|
||||
@@ -2176,6 +2581,8 @@ class LiteLLMProvider(LLMProvider):
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
cost_usd=cost_usd,
|
||||
model=_ev.model,
|
||||
)
|
||||
break
|
||||
@@ -2386,6 +2793,8 @@ class LiteLLMProvider(LLMProvider):
|
||||
tool_calls: list[dict[str, Any]] = []
|
||||
input_tokens = 0
|
||||
output_tokens = 0
|
||||
cached_tokens = 0
|
||||
cache_creation_tokens = 0
|
||||
stop_reason = ""
|
||||
model = self.model
|
||||
|
||||
@@ -2403,6 +2812,8 @@ class LiteLLMProvider(LLMProvider):
|
||||
elif isinstance(event, FinishEvent):
|
||||
input_tokens = event.input_tokens
|
||||
output_tokens = event.output_tokens
|
||||
cached_tokens = event.cached_tokens
|
||||
cache_creation_tokens = event.cache_creation_tokens
|
||||
stop_reason = event.stop_reason
|
||||
if event.model:
|
||||
model = event.model
|
||||
@@ -2415,6 +2826,8 @@ class LiteLLMProvider(LLMProvider):
|
||||
model=model,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cached_tokens=cached_tokens,
|
||||
cache_creation_tokens=cache_creation_tokens,
|
||||
stop_reason=stop_reason,
|
||||
raw_response={"tool_calls": tool_calls} if tool_calls else None,
|
||||
)
|
||||
|
||||
@@ -155,8 +155,11 @@ class MockLLMProvider(LLMProvider):
|
||||
response_format: dict[str, Any] | None = None,
|
||||
json_mode: bool = False,
|
||||
max_retries: int | None = None,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> LLMResponse:
|
||||
"""Async mock completion (no I/O, returns immediately)."""
|
||||
if system_dynamic_suffix:
|
||||
system = f"{system}\n\n{system_dynamic_suffix}" if system else system_dynamic_suffix
|
||||
return self.complete(
|
||||
messages=messages,
|
||||
system=system,
|
||||
@@ -173,6 +176,7 @@ class MockLLMProvider(LLMProvider):
|
||||
system: str = "",
|
||||
tools: list[Tool] | None = None,
|
||||
max_tokens: int = 4096,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> AsyncIterator[StreamEvent]:
|
||||
"""Stream a mock completion as word-level TextDeltaEvents.
|
||||
|
||||
@@ -180,6 +184,8 @@ class MockLLMProvider(LLMProvider):
|
||||
TextDeltaEvent with an accumulating snapshot, exercising the full
|
||||
streaming pipeline without any API calls.
|
||||
"""
|
||||
if system_dynamic_suffix:
|
||||
system = f"{system}\n\n{system_dynamic_suffix}" if system else system_dynamic_suffix
|
||||
content = self._generate_mock_response(system=system, json_mode=False)
|
||||
words = content.split(" ")
|
||||
accumulated = ""
|
||||
|
||||
@@ -9,47 +9,65 @@
|
||||
"label": "Haiku 4.5 - Fast + cheap",
|
||||
"recommended": false,
|
||||
"max_tokens": 64000,
|
||||
"max_context_tokens": 136000
|
||||
"max_context_tokens": 136000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "claude-sonnet-4-5-20250929",
|
||||
"label": "Sonnet 4.5 - Best balance",
|
||||
"recommended": false,
|
||||
"max_tokens": 64000,
|
||||
"max_context_tokens": 136000
|
||||
"max_context_tokens": 136000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "claude-opus-4-6",
|
||||
"label": "Opus 4.6 - Most capable",
|
||||
"recommended": true,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 872000
|
||||
"max_context_tokens": 872000,
|
||||
"supports_vision": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"openai": {
|
||||
"default_model": "gpt-5.4",
|
||||
"default_model": "gpt-5.5",
|
||||
"models": [
|
||||
{
|
||||
"id": "gpt-5.4",
|
||||
"label": "GPT-5.4 - Best intelligence",
|
||||
"id": "gpt-5.5",
|
||||
"label": "GPT-5.5 - Frontier coding + reasoning",
|
||||
"recommended": true,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 960000
|
||||
"max_context_tokens": 1050000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 5.00,
|
||||
"output": 30.00
|
||||
},
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "gpt-5.4",
|
||||
"label": "GPT-5.4 - Previous flagship",
|
||||
"recommended": false,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 960000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "gpt-5.4-mini",
|
||||
"label": "GPT-5.4 Mini - Faster + cheaper",
|
||||
"recommended": false,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 400000
|
||||
"max_context_tokens": 400000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "gpt-5.4-nano",
|
||||
"label": "GPT-5.4 Nano - Cheapest high-volume",
|
||||
"recommended": false,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 400000
|
||||
"max_context_tokens": 400000,
|
||||
"supports_vision": true
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -61,14 +79,16 @@
|
||||
"label": "Gemini 3 Flash - Fast",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 240000
|
||||
"max_context_tokens": 240000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "gemini-3.1-pro-preview-customtools",
|
||||
"label": "Gemini 3.1 Pro - Best quality",
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 240000
|
||||
"max_context_tokens": 240000,
|
||||
"supports_vision": true
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -80,28 +100,32 @@
|
||||
"label": "GPT-OSS 120B - Best reasoning",
|
||||
"recommended": true,
|
||||
"max_tokens": 65536,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "openai/gpt-oss-20b",
|
||||
"label": "GPT-OSS 20B - Fast + cheaper",
|
||||
"recommended": false,
|
||||
"max_tokens": 65536,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "llama-3.3-70b-versatile",
|
||||
"label": "Llama 3.3 70B - General purpose",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "llama-3.1-8b-instant",
|
||||
"label": "Llama 3.1 8B - Fastest",
|
||||
"recommended": false,
|
||||
"max_tokens": 131072,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -113,28 +137,24 @@
|
||||
"label": "GPT-OSS 120B - Best production reasoning",
|
||||
"recommended": true,
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 131072
|
||||
},
|
||||
{
|
||||
"id": "llama3.1-8b",
|
||||
"label": "Llama 3.1 8B - Fastest production",
|
||||
"recommended": false,
|
||||
"max_tokens": 8192,
|
||||
"max_context_tokens": 32768
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "zai-glm-4.7",
|
||||
"label": "Z.ai GLM 4.7 - Strong coding preview",
|
||||
"recommended": true,
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "qwen-3-235b-a22b-instruct-2507",
|
||||
"label": "Qwen 3 235B Instruct - Frontier preview",
|
||||
"recommended": false,
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -145,15 +165,21 @@
|
||||
"id": "MiniMax-M2.7",
|
||||
"label": "MiniMax M2.7 - Best coding quality",
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 204800
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 180000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 0.30,
|
||||
"output": 1.20
|
||||
},
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "MiniMax-M2.5",
|
||||
"label": "MiniMax M2.5 - Strong value",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 204800
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 180000,
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -165,28 +191,32 @@
|
||||
"label": "Mistral Large 3 - Best quality",
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 256000
|
||||
"max_context_tokens": 256000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "mistral-medium-2508",
|
||||
"label": "Mistral Medium 3.1 - Balanced",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 128000
|
||||
"max_context_tokens": 128000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "mistral-small-2603",
|
||||
"label": "Mistral Small 4 - Fast + capable",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 256000
|
||||
"max_context_tokens": 256000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "codestral-2508",
|
||||
"label": "Codestral - Coding specialist",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 128000
|
||||
"max_context_tokens": 128000,
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -198,47 +228,71 @@
|
||||
"label": "DeepSeek V3.1 - Best general coding",
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 128000
|
||||
"max_context_tokens": 128000,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8",
|
||||
"label": "Qwen3 Coder 480B - Advanced coding",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 262144
|
||||
"max_context_tokens": 262144,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "openai/gpt-oss-120b",
|
||||
"label": "GPT-OSS 120B - Strong reasoning",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 128000
|
||||
"max_context_tokens": 128000,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
||||
"label": "Llama 3.3 70B Turbo - Fast baseline",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 131072,
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"deepseek": {
|
||||
"default_model": "deepseek-chat",
|
||||
"default_model": "deepseek-v4-pro",
|
||||
"models": [
|
||||
{
|
||||
"id": "deepseek-chat",
|
||||
"label": "DeepSeek Chat - Fast default",
|
||||
"id": "deepseek-v4-pro",
|
||||
"label": "DeepSeek V4 Pro - Most capable",
|
||||
"recommended": true,
|
||||
"max_tokens": 8192,
|
||||
"max_context_tokens": 128000
|
||||
"max_tokens": 384000,
|
||||
"max_context_tokens": 1000000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 1.74,
|
||||
"output": 3.48,
|
||||
"cache_read": 0.145
|
||||
},
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "deepseek-v4-flash",
|
||||
"label": "DeepSeek V4 Flash - Fast + cheap",
|
||||
"recommended": true,
|
||||
"max_tokens": 384000,
|
||||
"max_context_tokens": 1000000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 0.14,
|
||||
"output": 0.28,
|
||||
"cache_read": 0.028
|
||||
},
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "deepseek-reasoner",
|
||||
"label": "DeepSeek Reasoner - Deep thinking",
|
||||
"label": "DeepSeek Reasoner - Legacy (deprecating)",
|
||||
"recommended": false,
|
||||
"max_tokens": 64000,
|
||||
"max_context_tokens": 128000
|
||||
"max_context_tokens": 128000,
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -250,7 +304,13 @@
|
||||
"label": "Kimi K2.5 - Best coding",
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 200000
|
||||
"max_context_tokens": 200000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 0.60,
|
||||
"output": 2.50,
|
||||
"cache_read": 0.15
|
||||
},
|
||||
"supports_vision": true
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -262,21 +322,30 @@
|
||||
"label": "Queen - Hive native",
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 180000
|
||||
"max_context_tokens": 180000,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "kimi-2.5",
|
||||
"id": "kimi-k2.5",
|
||||
"label": "Kimi 2.5 - Via Hive",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 240000
|
||||
"max_context_tokens": 240000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "GLM-5",
|
||||
"label": "GLM-5 - Via Hive",
|
||||
"id": "glm-5.1",
|
||||
"label": "GLM-5.1 - Via Hive",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 180000
|
||||
"max_context_tokens": 180000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 1.40,
|
||||
"output": 4.40,
|
||||
"cache_read": 0.26,
|
||||
"cache_creation": 0.0
|
||||
},
|
||||
"supports_vision": false
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -288,70 +357,82 @@
|
||||
"label": "GPT-5.4 - Best overall",
|
||||
"recommended": true,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 922000
|
||||
"max_context_tokens": 872000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "anthropic/claude-sonnet-4.6",
|
||||
"label": "Claude Sonnet 4.6 - Best coding balance",
|
||||
"recommended": false,
|
||||
"max_tokens": 64000,
|
||||
"max_context_tokens": 936000
|
||||
"max_context_tokens": 872000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "anthropic/claude-opus-4.6",
|
||||
"label": "Claude Opus 4.6 - Most capable",
|
||||
"recommended": false,
|
||||
"max_tokens": 128000,
|
||||
"max_context_tokens": 872000
|
||||
"max_context_tokens": 872000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "google/gemini-3.1-pro-preview-customtools",
|
||||
"label": "Gemini 3.1 Pro Preview - Long-context reasoning",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 1048576
|
||||
},
|
||||
{
|
||||
"id": "deepseek/deepseek-v3.2",
|
||||
"label": "DeepSeek V3.2 - Best value",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 163840
|
||||
"max_context_tokens": 872000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "qwen/qwen3.6-plus",
|
||||
"label": "Qwen 3.6 Plus - Strong reasoning",
|
||||
"recommended": false,
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 131072
|
||||
"max_context_tokens": 240000,
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "z-ai/glm-5v-turbo",
|
||||
"label": "GLM-5V Turbo - Vision capable",
|
||||
"recommended": false,
|
||||
"max_tokens": 16384,
|
||||
"max_context_tokens": 128000
|
||||
"recommended": true,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 192000,
|
||||
"supports_vision": true
|
||||
},
|
||||
{
|
||||
"id": "x-ai/grok-4.20",
|
||||
"label": "Grok 4.20 - xAI flagship",
|
||||
"id": "z-ai/glm-5.1",
|
||||
"label": "GLM-5.1 - Better but Slower",
|
||||
"recommended": true,
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 192000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 1.40,
|
||||
"output": 4.40,
|
||||
"cache_read": 0.26,
|
||||
"cache_creation": 0.0
|
||||
},
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "minimax/minimax-m2.7",
|
||||
"label": "Minimax M2.7 - Minimax flagship",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 131072
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 180000,
|
||||
"pricing_usd_per_mtok": {
|
||||
"input": 0.30,
|
||||
"output": 1.20
|
||||
},
|
||||
"supports_vision": false
|
||||
},
|
||||
{
|
||||
"id": "xiaomi/mimo-v2-pro",
|
||||
"label": "MiMo V2 Pro - Xiaomi multimodal",
|
||||
"recommended": false,
|
||||
"max_tokens": 16384,
|
||||
"max_context_tokens": 65536
|
||||
},
|
||||
{
|
||||
"id": "stepfun/step-3.5-flash",
|
||||
"label": "Step 3.5 Flash - Fast inference",
|
||||
"recommended": false,
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 128000
|
||||
"recommended": true,
|
||||
"max_tokens": 64000,
|
||||
"max_context_tokens": 240000,
|
||||
"supports_vision": true
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -366,7 +447,7 @@
|
||||
"zai_code": {
|
||||
"provider": "openai",
|
||||
"api_key_env_var": "ZAI_API_KEY",
|
||||
"model": "glm-5",
|
||||
"model": "glm-5.1",
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 180000,
|
||||
"api_base": "https://api.z.ai/api/coding/paas/v4"
|
||||
@@ -382,8 +463,8 @@
|
||||
"provider": "minimax",
|
||||
"api_key_env_var": "MINIMAX_API_KEY",
|
||||
"model": "MiniMax-M2.7",
|
||||
"max_tokens": 32768,
|
||||
"max_context_tokens": 204800,
|
||||
"max_tokens": 40960,
|
||||
"max_context_tokens": 180800,
|
||||
"api_base": "https://api.minimax.io/v1"
|
||||
},
|
||||
"kimi_code": {
|
||||
@@ -408,13 +489,13 @@
|
||||
"recommended": true
|
||||
},
|
||||
{
|
||||
"id": "kimi-2.5",
|
||||
"label": "kimi-2.5",
|
||||
"id": "kimi-k2.5",
|
||||
"label": "kimi-k2.5",
|
||||
"recommended": false
|
||||
},
|
||||
{
|
||||
"id": "GLM-5",
|
||||
"label": "GLM-5",
|
||||
"id": "glm-5.1",
|
||||
"label": "glm-5.1",
|
||||
"recommended": false
|
||||
}
|
||||
]
|
||||
@@ -432,4 +513,4 @@
|
||||
"api_base": "http://localhost:11434"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -27,6 +27,28 @@ def _require_list(value: Any, path: str) -> list[Any]:
|
||||
return value
|
||||
|
||||
|
||||
_PRICING_KEYS = ("input", "output", "cache_read", "cache_creation")
|
||||
|
||||
|
||||
def _validate_pricing(value: Any, path: str) -> None:
|
||||
"""Validate an optional ``pricing_usd_per_mtok`` block.
|
||||
|
||||
Keys are USD-per-million-tokens rates. ``input``/``output`` are required;
|
||||
``cache_read``/``cache_creation`` are optional. All values must be
|
||||
non-negative numbers. Used as a last-resort fallback when neither the
|
||||
provider nor LiteLLM's catalog reports a cost.
|
||||
"""
|
||||
pricing = _require_mapping(value, path)
|
||||
for key in ("input", "output"):
|
||||
if key not in pricing:
|
||||
raise ModelCatalogError(f"{path}.{key} is required")
|
||||
for key, rate in pricing.items():
|
||||
if key not in _PRICING_KEYS:
|
||||
raise ModelCatalogError(f"{path}.{key} is not a recognized pricing field")
|
||||
if not isinstance(rate, (int, float)) or isinstance(rate, bool) or rate < 0:
|
||||
raise ModelCatalogError(f"{path}.{key} must be a non-negative number")
|
||||
|
||||
|
||||
def _validate_model_catalog(data: dict[str, Any]) -> dict[str, Any]:
|
||||
providers = _require_mapping(data.get("providers"), "providers")
|
||||
|
||||
@@ -69,6 +91,14 @@ def _validate_model_catalog(data: dict[str, Any]) -> dict[str, Any]:
|
||||
if not isinstance(value, int) or value <= 0:
|
||||
raise ModelCatalogError(f"{model_path}.{key} must be a positive integer")
|
||||
|
||||
pricing = model_map.get("pricing_usd_per_mtok")
|
||||
if pricing is not None:
|
||||
_validate_pricing(pricing, f"{model_path}.pricing_usd_per_mtok")
|
||||
|
||||
supports_vision = model_map.get("supports_vision")
|
||||
if supports_vision is not None and not isinstance(supports_vision, bool):
|
||||
raise ModelCatalogError(f"{model_path}.supports_vision must be a boolean when present")
|
||||
|
||||
if not default_found:
|
||||
raise ModelCatalogError(
|
||||
f"{provider_path}.default_model={default_model!r} is not present in {provider_path}.models"
|
||||
@@ -184,6 +214,53 @@ def get_model_limits(provider: str, model_id: str) -> tuple[int, int] | None:
|
||||
return int(model["max_tokens"]), int(model["max_context_tokens"])
|
||||
|
||||
|
||||
def get_model_pricing(model_id: str) -> dict[str, float] | None:
|
||||
"""Return ``pricing_usd_per_mtok`` for a model id, searching all providers.
|
||||
|
||||
Returns ``None`` when the model is absent from the catalog or has no
|
||||
pricing entry. Used by the cost-extraction fallback in ``litellm.py``
|
||||
when the provider response and LiteLLM's catalog both come up empty.
|
||||
"""
|
||||
if not model_id:
|
||||
return None
|
||||
for provider_info in load_model_catalog()["providers"].values():
|
||||
for model in provider_info["models"]:
|
||||
if model["id"] == model_id:
|
||||
pricing = model.get("pricing_usd_per_mtok")
|
||||
if pricing is None:
|
||||
return None
|
||||
return {key: float(rate) for key, rate in pricing.items()}
|
||||
return None
|
||||
|
||||
|
||||
def model_supports_vision(model_id: str) -> bool:
|
||||
"""Return whether *model_id* supports image inputs per the curated catalog.
|
||||
|
||||
Looks up the bare model id (and the provider-prefix-stripped form) in the
|
||||
catalog. Returns the model's ``supports_vision`` flag when found, defaulting
|
||||
to ``True`` for unknown models or when the flag is absent — assume vision
|
||||
capable for hosted providers, since modern frontier models support images
|
||||
by default and the captioning fallback is more expensive than just letting
|
||||
the provider handle the image.
|
||||
"""
|
||||
if not model_id:
|
||||
return True
|
||||
|
||||
candidates = [model_id]
|
||||
if "/" in model_id:
|
||||
candidates.append(model_id.split("/", 1)[1])
|
||||
|
||||
for candidate in candidates:
|
||||
for provider_info in load_model_catalog()["providers"].values():
|
||||
for model in provider_info["models"]:
|
||||
if model["id"] == candidate:
|
||||
flag = model.get("supports_vision")
|
||||
if isinstance(flag, bool):
|
||||
return flag
|
||||
return True
|
||||
return True
|
||||
|
||||
|
||||
def get_preset(preset_id: str) -> dict[str, Any] | None:
|
||||
"""Return one preset entry."""
|
||||
preset = load_model_catalog()["presets"].get(preset_id)
|
||||
|
||||
@@ -10,12 +10,24 @@ from typing import Any
|
||||
|
||||
@dataclass
|
||||
class LLMResponse:
|
||||
"""Response from an LLM call."""
|
||||
"""Response from an LLM call.
|
||||
|
||||
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
|
||||
``input_tokens`` (providers report them inside ``prompt_tokens``).
|
||||
Surface them for visibility; do not add to a total.
|
||||
|
||||
``cost_usd`` is the per-call USD cost when the provider / pricing table
|
||||
can produce one (Anthropic, OpenAI, OpenRouter are supported). 0.0 when
|
||||
unknown or unpriced — treat as "unreported", not "free".
|
||||
"""
|
||||
|
||||
content: str
|
||||
model: str
|
||||
input_tokens: int = 0
|
||||
output_tokens: int = 0
|
||||
cached_tokens: int = 0
|
||||
cache_creation_tokens: int = 0
|
||||
cost_usd: float = 0.0
|
||||
stop_reason: str = ""
|
||||
raw_response: Any = None
|
||||
|
||||
@@ -110,19 +122,28 @@ class LLMProvider(ABC):
|
||||
response_format: dict[str, Any] | None = None,
|
||||
json_mode: bool = False,
|
||||
max_retries: int | None = None,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> "LLMResponse":
|
||||
"""Async version of complete(). Non-blocking on the event loop.
|
||||
|
||||
Default implementation offloads the sync complete() to a thread pool.
|
||||
Subclasses SHOULD override for native async I/O.
|
||||
|
||||
``system_dynamic_suffix`` is an optional per-turn tail for providers
|
||||
that honor ``cache_control`` (see LiteLLMProvider for semantics).
|
||||
The default implementation concatenates it onto ``system`` since the
|
||||
sync ``complete()`` path does not support the split.
|
||||
"""
|
||||
combined_system = system
|
||||
if system_dynamic_suffix:
|
||||
combined_system = f"{system}\n\n{system_dynamic_suffix}" if system else system_dynamic_suffix
|
||||
loop = asyncio.get_running_loop()
|
||||
return await loop.run_in_executor(
|
||||
None,
|
||||
partial(
|
||||
self.complete,
|
||||
messages=messages,
|
||||
system=system,
|
||||
system=combined_system,
|
||||
tools=tools,
|
||||
max_tokens=max_tokens,
|
||||
response_format=response_format,
|
||||
@@ -137,6 +158,7 @@ class LLMProvider(ABC):
|
||||
system: str = "",
|
||||
tools: list[Tool] | None = None,
|
||||
max_tokens: int = 4096,
|
||||
system_dynamic_suffix: str | None = None,
|
||||
) -> AsyncIterator["StreamEvent"]:
|
||||
"""
|
||||
Stream a completion as an async iterator of StreamEvents.
|
||||
@@ -147,6 +169,9 @@ class LLMProvider(ABC):
|
||||
Tool orchestration is the CALLER's responsibility:
|
||||
- Caller detects ToolCallEvent, executes tool, adds result
|
||||
to messages, calls stream() again.
|
||||
|
||||
``system_dynamic_suffix`` is forwarded to ``acomplete``; see its
|
||||
docstring for the two-block split semantics.
|
||||
"""
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -159,6 +184,7 @@ class LLMProvider(ABC):
|
||||
system=system,
|
||||
tools=tools,
|
||||
max_tokens=max_tokens,
|
||||
system_dynamic_suffix=system_dynamic_suffix,
|
||||
)
|
||||
yield TextDeltaEvent(content=response.content, snapshot=response.content)
|
||||
yield TextEndEvent(full_text=response.content)
|
||||
@@ -166,6 +192,9 @@ class LLMProvider(ABC):
|
||||
stop_reason=response.stop_reason,
|
||||
input_tokens=response.input_tokens,
|
||||
output_tokens=response.output_tokens,
|
||||
cached_tokens=response.cached_tokens,
|
||||
cache_creation_tokens=response.cache_creation_tokens,
|
||||
cost_usd=response.cost_usd,
|
||||
model=response.model,
|
||||
)
|
||||
|
||||
|
||||
@@ -65,13 +65,23 @@ class ReasoningDeltaEvent:
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FinishEvent:
|
||||
"""The LLM has finished generating."""
|
||||
"""The LLM has finished generating.
|
||||
|
||||
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
|
||||
``input_tokens`` — providers count both inside ``prompt_tokens`` already.
|
||||
Surface them separately for visibility; never add to a total.
|
||||
|
||||
``cost_usd`` is the per-turn USD cost when the provider or LiteLLM's
|
||||
pricing table supplies one; 0.0 means unreported (not free).
|
||||
"""
|
||||
|
||||
type: Literal["finish"] = "finish"
|
||||
stop_reason: str = ""
|
||||
input_tokens: int = 0
|
||||
output_tokens: int = 0
|
||||
cached_tokens: int = 0
|
||||
cache_creation_tokens: int = 0
|
||||
cost_usd: float = 0.0
|
||||
model: str = ""
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from datetime import UTC
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.config import get_hive_config, get_preferred_model
|
||||
from framework.config import HIVE_HOME as _HIVE_HOME, get_hive_config, get_preferred_model
|
||||
from framework.credentials.validation import (
|
||||
ensure_credential_key_env as _ensure_credential_key_env,
|
||||
)
|
||||
@@ -558,7 +558,7 @@ ANTIGRAVITY_IDE_STATE_DB = (
|
||||
# Linux fallback for the IDE state DB
|
||||
ANTIGRAVITY_IDE_STATE_DB_LINUX = Path.home() / ".config" / "Antigravity" / "User" / "globalStorage" / "state.vscdb"
|
||||
# Antigravity credentials stored by native OAuth implementation
|
||||
ANTIGRAVITY_AUTH_FILE = Path.home() / ".hive" / "antigravity-accounts.json"
|
||||
ANTIGRAVITY_AUTH_FILE = _HIVE_HOME / "antigravity-accounts.json"
|
||||
|
||||
ANTIGRAVITY_OAUTH_TOKEN_URL = "https://oauth2.googleapis.com/token"
|
||||
_ANTIGRAVITY_TOKEN_LIFETIME_SECS = 3600 # Google access tokens expire in 1 hour
|
||||
@@ -1389,7 +1389,7 @@ class AgentLoader:
|
||||
)
|
||||
|
||||
if storage_path is None:
|
||||
storage_path = Path.home() / ".hive" / "agents" / agent_path.name / worker_name
|
||||
storage_path = _HIVE_HOME / "agents" / agent_path.name / worker_name
|
||||
storage_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
runner = cls(
|
||||
@@ -1503,6 +1503,7 @@ class AgentLoader:
|
||||
from framework.pipeline.stages.mcp_registry import McpRegistryStage
|
||||
from framework.pipeline.stages.skill_registry import SkillRegistryStage
|
||||
from framework.skills.config import SkillsConfig
|
||||
from framework.skills.discovery import ExtraScope
|
||||
|
||||
configure_logging(level="INFO", format="auto")
|
||||
|
||||
@@ -1545,6 +1546,19 @@ class AgentLoader:
|
||||
default_skills=getattr(self, "_agent_default_skills", None),
|
||||
skills=getattr(self, "_agent_skills", None),
|
||||
),
|
||||
# Surface the colony's flat ``skills/`` directory as a
|
||||
# ``colony_ui`` extra scope so SKILL.md files written there
|
||||
# by ``create_colony`` (or the HTTP routes) are picked up
|
||||
# with correct provenance. The legacy nested
|
||||
# ``<colony>/.hive/skills/`` path is still picked up via
|
||||
# project-scope auto-discovery (project_root above).
|
||||
extra_scope_dirs=[
|
||||
ExtraScope(
|
||||
directory=self.agent_path / "skills",
|
||||
label="colony_ui",
|
||||
priority=3,
|
||||
)
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -51,6 +51,14 @@ _DEFAULT_LOCAL_SERVERS: dict[str, dict[str, Any]] = {
|
||||
"description": "File I/O: read, write, edit, search, list, run commands",
|
||||
"args": ["run", "python", "files_server.py", "--stdio"],
|
||||
},
|
||||
"terminal-tools": {
|
||||
"description": "Terminal capabilities",
|
||||
"args": ["run", "python", "terminal_tools_server.py", "--stdio"],
|
||||
},
|
||||
"chart-tools": {
|
||||
"description": "BI/financial chart + diagram rendering: ECharts, Mermaid",
|
||||
"args": ["run", "python", "chart_tools_server.py", "--stdio"],
|
||||
},
|
||||
}
|
||||
|
||||
# Aliases that earlier versions of ensure_defaults wrote under the wrong name.
|
||||
@@ -58,14 +66,22 @@ _DEFAULT_LOCAL_SERVERS: dict[str, dict[str, Any]] = {
|
||||
# name so the active agents (queen, credential_tester) can find their tools.
|
||||
_STALE_DEFAULT_ALIASES: dict[str, str] = {
|
||||
"hive_tools": "hive-tools",
|
||||
# 2026-04-30: shell-tools renamed to terminal-tools. Drop the stale name
|
||||
# on next ensure_defaults() so the queen's allowlist (which now includes
|
||||
# @server:terminal-tools) actually finds a server with the new name.
|
||||
"terminal-tools": "shell-tools",
|
||||
}
|
||||
|
||||
|
||||
class MCPRegistry:
|
||||
"""Manages local MCP server state in ~/.hive/mcp_registry/."""
|
||||
"""Manages local MCP server state in $HIVE_HOME/mcp_registry/."""
|
||||
|
||||
def __init__(self, base_path: Path | None = None):
|
||||
self._base = base_path or Path.home() / ".hive" / "mcp_registry"
|
||||
if base_path is None:
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
base_path = HIVE_HOME / "mcp_registry"
|
||||
self._base = base_path
|
||||
self._installed_path = self._base / "installed.json"
|
||||
self._config_path = self._base / "config.json"
|
||||
self._cache_dir = self._base / "cache"
|
||||
@@ -73,7 +89,30 @@ class MCPRegistry:
|
||||
# ── Initialization ──────────────────────────────────────────────
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Create directory structure and default files if missing."""
|
||||
"""Create directory structure, default files, and seed bundled servers.
|
||||
|
||||
Every read path (queen orchestrator, pipeline stage, CLI, routes)
|
||||
calls this — keeping the seeding here means a fresh ``HIVE_HOME``
|
||||
(e.g. the desktop's per-user dir under ``~/.config/Hive/users/<hash>/``
|
||||
or ``~/Library/Application Support/Hive/users/<hash>/``) is always
|
||||
populated with ``hive_tools`` / ``gcu-tools`` / ``files-tools`` /
|
||||
``shell-tools`` before any agent code reads ``installed.json``.
|
||||
Without this, ``load_agent_selection()`` resolves an empty registry
|
||||
and emits "Server X requested but not installed" warnings even
|
||||
though the server is bundled.
|
||||
|
||||
Idempotent — already-installed entries are left untouched.
|
||||
"""
|
||||
self._bootstrap_io()
|
||||
self._seed_defaults()
|
||||
|
||||
def _bootstrap_io(self) -> None:
|
||||
"""Create the registry directory + empty config/installed files.
|
||||
|
||||
Split out from ``initialize()`` so ``_seed_defaults()`` can call it
|
||||
without re-entering the seeding logic (which would recurse via
|
||||
``_read_installed()`` → ``initialize()``).
|
||||
"""
|
||||
self._base.mkdir(parents=True, exist_ok=True)
|
||||
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -84,21 +123,33 @@ class MCPRegistry:
|
||||
self._write_json(self._installed_path, {"servers": {}})
|
||||
|
||||
def ensure_defaults(self) -> list[str]:
|
||||
"""Seed the built-in local MCP servers (hive-tools, gcu-tools, files-tools).
|
||||
"""Public alias kept for the ``hive mcp init`` CLI command.
|
||||
|
||||
Idempotent — servers already present are left untouched. Skips seeding
|
||||
entirely when the source-tree ``tools/`` directory cannot be located
|
||||
(e.g. when Hive is installed from a wheel rather than a checkout).
|
||||
|
||||
Returns the list of names that were newly registered.
|
||||
Returns the list of newly-registered server names so the CLI can
|
||||
print them. Same idempotent seeding logic as ``initialize()``.
|
||||
"""
|
||||
self.initialize()
|
||||
self._bootstrap_io()
|
||||
return self._seed_defaults()
|
||||
|
||||
def _seed_defaults(self) -> list[str]:
|
||||
"""Idempotently register the bundled default local servers.
|
||||
|
||||
Skips entirely when the source-tree ``tools/`` directory cannot
|
||||
be located (e.g. wheel installs). Returns the list of names that
|
||||
were newly registered.
|
||||
|
||||
Also runs a self-heal pass over already-registered defaults: if an
|
||||
entry's stdio cwd is unreachable on this machine (e.g. the registry
|
||||
was copied from another developer's box and points at their
|
||||
``/Users/<them>/...`` path), the entry is overwritten with the
|
||||
canonical config so the queen can actually spawn it. The user's
|
||||
``enabled`` toggle and ``overrides`` are preserved.
|
||||
"""
|
||||
# parents: [0]=loader, [1]=framework, [2]=core, [3]=repo root
|
||||
tools_dir = Path(__file__).resolve().parents[3] / "tools"
|
||||
if not tools_dir.is_dir():
|
||||
logger.debug(
|
||||
"MCPRegistry.ensure_defaults: tools dir %s missing; skipping default seed",
|
||||
"MCPRegistry._seed_defaults: tools dir %s missing; skipping default seed",
|
||||
tools_dir,
|
||||
)
|
||||
return []
|
||||
@@ -115,14 +166,37 @@ class MCPRegistry:
|
||||
for canonical, stale in _STALE_DEFAULT_ALIASES.items():
|
||||
if stale in existing and canonical not in existing:
|
||||
logger.info(
|
||||
"MCPRegistry.ensure_defaults: removing stale alias '%s' (canonical: '%s')",
|
||||
"MCPRegistry._seed_defaults: removing stale alias '%s' (canonical: '%s')",
|
||||
stale,
|
||||
canonical,
|
||||
)
|
||||
del existing[stale]
|
||||
mutated = True
|
||||
|
||||
repaired: list[str] = []
|
||||
for name, spec in _DEFAULT_LOCAL_SERVERS.items():
|
||||
entry = existing.get(name)
|
||||
if entry is None:
|
||||
continue
|
||||
if self._default_entry_runnable(entry, tools_dir, list(spec["args"])):
|
||||
continue
|
||||
existing[name] = self._build_default_entry(
|
||||
name=name,
|
||||
spec=spec,
|
||||
cwd=cwd,
|
||||
preserve_from=entry,
|
||||
)
|
||||
repaired.append(name)
|
||||
mutated = True
|
||||
|
||||
if mutated:
|
||||
self._write_installed(data)
|
||||
if repaired:
|
||||
logger.warning(
|
||||
"MCPRegistry._seed_defaults: repaired %d default server(s) with unreachable cwd/script: %s",
|
||||
len(repaired),
|
||||
repaired,
|
||||
)
|
||||
|
||||
for name, spec in _DEFAULT_LOCAL_SERVERS.items():
|
||||
if name in existing:
|
||||
@@ -138,12 +212,97 @@ class MCPRegistry:
|
||||
)
|
||||
added.append(name)
|
||||
except MCPError as exc:
|
||||
logger.warning("MCPRegistry.ensure_defaults: failed to seed '%s': %s", name, exc)
|
||||
logger.warning("MCPRegistry._seed_defaults: failed to seed '%s': %s", name, exc)
|
||||
|
||||
if added:
|
||||
logger.info("MCPRegistry: seeded default local servers: %s", added)
|
||||
return added
|
||||
|
||||
@staticmethod
|
||||
def _default_entry_runnable(entry: dict, tools_dir: Path, canonical_args: list[str]) -> bool:
|
||||
"""Return True iff ``entry`` can plausibly be spawned on this machine.
|
||||
|
||||
Checks:
|
||||
- transport is stdio (only stdio defaults exist today; non-stdio
|
||||
gets a free pass since we have nothing to compare against)
|
||||
- stdio.cwd is an existing directory
|
||||
- the entry script (the first ``.py`` arg, e.g. ``files_server.py``)
|
||||
exists relative to that cwd
|
||||
|
||||
We deliberately do NOT spawn the subprocess here — this runs on
|
||||
every read path and must be cheap. A filesystem reachability
|
||||
check catches the cross-machine `cwd` drift that is the common
|
||||
failure, without flapping on transient runtime errors.
|
||||
"""
|
||||
transport = entry.get("transport") or "stdio"
|
||||
if transport != "stdio":
|
||||
return True
|
||||
manifest = entry.get("manifest") or {}
|
||||
stdio = manifest.get("stdio") or {}
|
||||
cwd_str = stdio.get("cwd")
|
||||
if not cwd_str:
|
||||
return False
|
||||
cwd_path = Path(cwd_str)
|
||||
if not cwd_path.is_dir():
|
||||
return False
|
||||
# Find the script: the first arg ending in .py, falling back to the
|
||||
# canonical spec if the registered args are unrecognizable. Modules
|
||||
# invoked via `python -m foo.bar` (no .py arg) are accepted as long
|
||||
# as the cwd exists — we can't cheaply prove the module imports.
|
||||
registered_args = stdio.get("args") or []
|
||||
script: str | None = next(
|
||||
(a for a in registered_args if isinstance(a, str) and a.endswith(".py")),
|
||||
None,
|
||||
)
|
||||
if script is None:
|
||||
script = next(
|
||||
(a for a in canonical_args if isinstance(a, str) and a.endswith(".py")),
|
||||
None,
|
||||
)
|
||||
if script is None:
|
||||
return True
|
||||
return (cwd_path / script).is_file()
|
||||
|
||||
@classmethod
|
||||
def _build_default_entry(
|
||||
cls,
|
||||
*,
|
||||
name: str,
|
||||
spec: dict[str, Any],
|
||||
cwd: str,
|
||||
preserve_from: dict | None,
|
||||
) -> dict:
|
||||
"""Construct a fresh canonical entry for a default server.
|
||||
|
||||
When ``preserve_from`` is provided, carries over the user's
|
||||
``enabled`` flag and ``overrides`` so a deliberate disable or
|
||||
custom env var survives the repair.
|
||||
"""
|
||||
manifest = {
|
||||
"name": name,
|
||||
"description": spec["description"],
|
||||
"transport": {"supported": ["stdio"], "default": "stdio"},
|
||||
"stdio": {
|
||||
"command": "uv",
|
||||
"args": list(spec["args"]),
|
||||
"env": {},
|
||||
"cwd": cwd,
|
||||
},
|
||||
}
|
||||
entry = cls._make_entry(
|
||||
source="local",
|
||||
manifest=manifest,
|
||||
transport="stdio",
|
||||
installed_by="hive mcp init (auto-repair)",
|
||||
)
|
||||
if preserve_from is not None:
|
||||
if "enabled" in preserve_from:
|
||||
entry["enabled"] = bool(preserve_from["enabled"])
|
||||
prior_overrides = preserve_from.get("overrides")
|
||||
if isinstance(prior_overrides, dict):
|
||||
entry["overrides"] = prior_overrides
|
||||
return entry
|
||||
|
||||
# ── Internal I/O ────────────────────────────────────────────────
|
||||
|
||||
def _read_installed(self) -> dict:
|
||||
|
||||
@@ -71,25 +71,36 @@ class ToolRegistry:
|
||||
{
|
||||
# File system reads
|
||||
"read_file",
|
||||
"list_directory",
|
||||
"grep",
|
||||
"glob",
|
||||
# Web reads
|
||||
"web_search",
|
||||
"web_fetch",
|
||||
"search_files",
|
||||
"pdf_read",
|
||||
# Terminal reads (rg / find / output buffer polling — neither
|
||||
# changes process state)
|
||||
"terminal_rg",
|
||||
"terminal_find",
|
||||
"terminal_output_get",
|
||||
# Web / research reads (re-issuable, side-effect-free fetches)
|
||||
"web_scrape",
|
||||
"search_papers",
|
||||
"search_wikipedia",
|
||||
"download_paper",
|
||||
# Browser read-only snapshots (mutate-free observations)
|
||||
"browser_screenshot",
|
||||
"browser_snapshot",
|
||||
"browser_console",
|
||||
"browser_get_text",
|
||||
# Background bash polling - reads output buffers only, does
|
||||
# not touch the subprocess itself.
|
||||
"bash_output",
|
||||
"browser_html",
|
||||
"browser_get_attribute",
|
||||
"browser_get_rect",
|
||||
}
|
||||
)
|
||||
|
||||
# Credential directory used for change detection
|
||||
_CREDENTIAL_DIR = Path("~/.hive/credentials/credentials").expanduser()
|
||||
# Credential directory used for change detection. Resolved at attribute
|
||||
# access so HIVE_HOME overrides (set by the desktop) are honoured.
|
||||
@property
|
||||
def _CREDENTIAL_DIR(self) -> Path:
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
return HIVE_HOME / "credentials" / "credentials"
|
||||
|
||||
def __init__(self):
|
||||
self._tools: dict[str, RegisteredTool] = {}
|
||||
@@ -457,7 +468,7 @@ class ToolRegistry:
|
||||
else:
|
||||
resolved_cwd = (base_dir / cwd).resolve()
|
||||
|
||||
# Find .py script in args (e.g. coder_tools_server.py, files_server.py)
|
||||
# Find .py script in args (e.g. files_server.py)
|
||||
script_name = None
|
||||
for i, arg in enumerate(args):
|
||||
if isinstance(arg, str) and arg.endswith(".py"):
|
||||
@@ -497,24 +508,6 @@ class ToolRegistry:
|
||||
config["cwd"] = str(resolved_cwd)
|
||||
return config
|
||||
|
||||
# For coder_tools_server, inject --project-root so reads land
|
||||
# in the expected workspace (hive repo, for framework skills
|
||||
# and docs), and inject --write-root so writes land under
|
||||
# ~/.hive/workspace/ instead of polluting the git checkout
|
||||
# with queen-authored skills, ledgers, and scripts. Without
|
||||
# the split, every ``write_file`` call from the queen landed
|
||||
# in the hive repo root.
|
||||
if script_name and "coder_tools" in script_name:
|
||||
project_root = str(resolved_cwd.parent.resolve())
|
||||
args = list(args)
|
||||
if "--project-root" not in args:
|
||||
args.extend(["--project-root", project_root])
|
||||
if "--write-root" not in args:
|
||||
_write_root = Path.home() / ".hive" / "workspace"
|
||||
_write_root.mkdir(parents=True, exist_ok=True)
|
||||
args.extend(["--write-root", str(_write_root)])
|
||||
config["args"] = args
|
||||
|
||||
if os.name == "nt":
|
||||
# Windows: cwd=None avoids WinError 267; use absolute script path
|
||||
config["cwd"] = None
|
||||
|
||||
@@ -89,10 +89,12 @@ class ActiveNodeClientIO(NodeClientIO):
|
||||
self._input_result = None
|
||||
|
||||
if self._event_bus is not None:
|
||||
# `prompt` is consumed by the caller separately (callers emit
|
||||
# it as a text delta when needed). The event only carries the
|
||||
# structured questions payload for widget rendering.
|
||||
await self._event_bus.emit_client_input_requested(
|
||||
stream_id=self.node_id,
|
||||
node_id=self.node_id,
|
||||
prompt=prompt,
|
||||
execution_id=self._execution_id or None,
|
||||
)
|
||||
|
||||
|
||||
@@ -29,9 +29,7 @@ _ALWAYS_AVAILABLE_TOOLS: frozenset[str] = frozenset(
|
||||
"read_file",
|
||||
"write_file",
|
||||
"edit_file",
|
||||
"list_directory",
|
||||
"search_files",
|
||||
"hashline_edit",
|
||||
"set_output",
|
||||
"escalate",
|
||||
}
|
||||
|
||||
@@ -9,8 +9,8 @@ Nodes that need browser access declare ``tools: {policy: "all"}`` in their
|
||||
agent.json config.
|
||||
|
||||
Note: the canonical source of truth for browser automation guidance is
|
||||
the ``browser-automation`` default skill at
|
||||
``core/framework/skills/_default_skills/browser-automation/SKILL.md``.
|
||||
the ``browser-automation`` preset skill at
|
||||
``core/framework/skills/_preset_skills/browser-automation/SKILL.md``.
|
||||
Activate that skill for the full decision tree. This module holds a
|
||||
compact subset suitable for direct inlining into a node's system prompt
|
||||
when a skill activation is not desired.
|
||||
@@ -35,7 +35,7 @@ Follow these rules for reliable, efficient browser interaction.
|
||||
Use snapshot first for structure and ordinary controls; switch to
|
||||
screenshot when snapshot can't find or verify the target. Interaction
|
||||
tools (`browser_click`, `browser_type`, `browser_type_focused`,
|
||||
`browser_fill`, `browser_scroll`) wait 0.5 s for the page to settle
|
||||
`browser_scroll`) wait 0.5 s for the page to settle
|
||||
after a successful action, then attach a fresh snapshot under the
|
||||
`snapshot` key of their result — so don't call `browser_snapshot`
|
||||
separately after an interaction unless you need a newer view. Tune
|
||||
@@ -140,8 +140,9 @@ shortcut dispatcher requires both), then releases in reverse order.
|
||||
## Tab management
|
||||
|
||||
Close tabs as soon as you're done with them — not only at the end of
|
||||
the task. `browser_close(target_id=...)` for one, `browser_close_finished()`
|
||||
for a full cleanup. Never accumulate more than 3 open tabs.
|
||||
the task. Use `browser_close(tab_id=...)` (or no arg to close the
|
||||
active tab); call it for each tab when cleaning up after a multi-tab
|
||||
workflow. Never accumulate more than 3 open tabs.
|
||||
`browser_tabs` reports an `origin` field: `"agent"` (you own it, close
|
||||
when done), `"popup"` (close after extracting), `"startup"`/`"user"`
|
||||
(leave alone).
|
||||
@@ -157,7 +158,7 @@ cookie consent banners if they block content.
|
||||
- If `browser_snapshot` fails, try `browser_get_text` with a narrow
|
||||
selector as fallback.
|
||||
- If `browser_open` fails or the page seems stale, `browser_stop` →
|
||||
`browser_start` → retry.
|
||||
`browser_open(url)` to lazy-create a fresh context.
|
||||
|
||||
## `browser_evaluate`
|
||||
|
||||
|
||||
@@ -543,6 +543,10 @@ class NodeContext:
|
||||
# Dynamic memory provider — when set, EventLoopNode rebuilds the
|
||||
# system prompt with the latest memory block each iteration.
|
||||
dynamic_memory_provider: Any = None # Callable[[], str] | None
|
||||
# Surgical skills-catalog refresh, same contract as AgentContext's
|
||||
# field of the same name. Lets workers pick up UI-driven skill
|
||||
# toggles without rebuilding the full system prompt each turn.
|
||||
dynamic_skills_catalog_provider: Any = None # Callable[[], str] | None
|
||||
|
||||
# Skill system prompts — injected by the skill discovery pipeline
|
||||
skills_catalog_prompt: str = "" # Available skills XML catalog
|
||||
|
||||
@@ -331,10 +331,10 @@ class Orchestrator:
|
||||
|
||||
# Strip tool names that aren't registered in this runtime instead of
|
||||
# hard-failing. The worker is forked from the queen's tool snapshot
|
||||
# which may include MCP tools the worker's runtime doesn't load (e.g.
|
||||
# coder-tools agent-management tools). Blocking the worker on missing
|
||||
# tools leaves the queen stranded mid-task; stripping + warning lets
|
||||
# the worker proceed with what it does have.
|
||||
# which may include MCP tools the worker's runtime doesn't load.
|
||||
# Blocking the worker on missing tools leaves the queen stranded
|
||||
# mid-task; stripping + warning lets the worker proceed with what
|
||||
# it does have.
|
||||
for node in graph.nodes:
|
||||
if node.id not in reachable:
|
||||
continue
|
||||
@@ -683,11 +683,10 @@ class Orchestrator:
|
||||
# Set per-execution data_dir and agent_id so data tools and
|
||||
# spillover files share the same session-scoped directory, and
|
||||
# so MCP tools whose server-side schemas mark agent_id as a
|
||||
# required field (list_dir, hashline_edit, replace_file_content,
|
||||
# execute_command_tool, …) get a valid value injected even on
|
||||
# registry instances where agent_loader.setup() didn't populate
|
||||
# the session_context. Without this, FastMCP rejects those
|
||||
# calls with "agent_id is a required property".
|
||||
# required field get a valid value injected even on registry
|
||||
# instances where agent_loader.setup() didn't populate the
|
||||
# session_context. Without this, FastMCP rejects those calls
|
||||
# with "agent_id is a required property".
|
||||
_ctx_token = None
|
||||
if self._storage_path:
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
|
||||
@@ -44,6 +44,9 @@ class McpRegistryStage(PipelineStage):
|
||||
from framework.loader.mcp_registry import MCPRegistry
|
||||
from framework.orchestrator.files import FILES_MCP_SERVER_NAME
|
||||
|
||||
# Bundled defaults (hive_tools / gcu-tools / files-tools / shell-tools)
|
||||
# are seeded inside MCPRegistry.initialize(); resolve_for_agent below
|
||||
# will find them even on a fresh HIVE_HOME.
|
||||
registry = MCPRegistry()
|
||||
mcp_loaded = False
|
||||
|
||||
|
||||
@@ -26,11 +26,15 @@ class SkillRegistryStage(PipelineStage):
|
||||
project_root: str | Path | None = None,
|
||||
interactive: bool = True,
|
||||
skills_config: Any = None,
|
||||
extra_scope_dirs: list[Any] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
self._project_root = Path(project_root) if project_root else None
|
||||
self._interactive = interactive
|
||||
self._skills_config = skills_config
|
||||
# Optional list of ExtraScope entries layered between user and
|
||||
# project scope (e.g. ``colony_ui`` for a colony agent's skills/).
|
||||
self._extra_scope_dirs = list(extra_scope_dirs) if extra_scope_dirs else []
|
||||
self.skills_manager: Any = None
|
||||
|
||||
async def initialize(self) -> None:
|
||||
@@ -41,6 +45,7 @@ class SkillRegistryStage(PipelineStage):
|
||||
skills_config=self._skills_config or SkillsConfig(),
|
||||
project_root=self._project_root,
|
||||
interactive=self._interactive,
|
||||
extra_scope_dirs=self._extra_scope_dirs,
|
||||
)
|
||||
self.skills_manager = SkillsManager(config)
|
||||
self.skills_manager.load()
|
||||
|
||||
@@ -155,6 +155,17 @@ class SessionState(BaseModel):
|
||||
# True after first successful worker execution (gates trigger delivery on restart)
|
||||
worker_configured: bool = Field(default=False)
|
||||
|
||||
# Task-system fields (see framework/tasks).
|
||||
# task_list_id: this session's own task list id (populated on first
|
||||
# task_create; immutable thereafter). Used for resume reattachment —
|
||||
# if it differs from resolve_task_list_id(ctx) on resume, a
|
||||
# TASK_LIST_REATTACH_MISMATCH event is emitted and a fresh list is
|
||||
# created at the resolved id (the orphan stays on disk).
|
||||
task_list_id: str | None = None
|
||||
# picked_up_from: for worker sessions, the (colony_task_list_id,
|
||||
# template_task_id) pair this session was spawned for.
|
||||
picked_up_from: list[Any] | None = None
|
||||
|
||||
model_config = {"extra": "allow"}
|
||||
|
||||
@property
|
||||
|
||||
+124
-17
@@ -1,5 +1,6 @@
|
||||
"""aiohttp Application factory for the Hive HTTP API server."""
|
||||
|
||||
import hmac
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -19,22 +20,31 @@ _REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
|
||||
_ALLOWED_AGENT_ROOTS: tuple[Path, ...] | None = None
|
||||
|
||||
|
||||
def _has_encrypted_credentials() -> bool:
|
||||
"""Return True when an encrypted credential store already exists on disk."""
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
cred_dir = HIVE_HOME / "credentials" / "credentials"
|
||||
return cred_dir.is_dir() and any(cred_dir.glob("*.enc"))
|
||||
|
||||
|
||||
def _get_allowed_agent_roots() -> tuple[Path, ...]:
|
||||
"""Return resolved allowed root directories for agent loading.
|
||||
|
||||
Roots are anchored to the repository root (derived from ``__file__``)
|
||||
so the allowlist is correct regardless of the process's working
|
||||
directory.
|
||||
directory. The hive-home subtrees honour ``HIVE_HOME`` so the desktop's
|
||||
per-user root is allowed in addition to (or instead of) ``~/.hive``.
|
||||
"""
|
||||
global _ALLOWED_AGENT_ROOTS
|
||||
if _ALLOWED_AGENT_ROOTS is None:
|
||||
from framework.config import COLONIES_DIR
|
||||
from framework.config import COLONIES_DIR, HIVE_HOME
|
||||
|
||||
_ALLOWED_AGENT_ROOTS = (
|
||||
COLONIES_DIR.resolve(), # ~/.hive/colonies/
|
||||
COLONIES_DIR.resolve(), # $HIVE_HOME/colonies/
|
||||
(_REPO_ROOT / "exports").resolve(), # compat fallback
|
||||
(_REPO_ROOT / "examples").resolve(),
|
||||
(Path.home() / ".hive" / "agents").resolve(),
|
||||
(HIVE_HOME / "agents").resolve(),
|
||||
)
|
||||
return _ALLOWED_AGENT_ROOTS
|
||||
|
||||
@@ -56,7 +66,8 @@ def validate_agent_path(agent_path: str | Path) -> Path:
|
||||
if resolved.is_relative_to(root) and resolved != root:
|
||||
return resolved
|
||||
raise ValueError(
|
||||
"agent_path must be inside an allowed directory (~/.hive/colonies/, exports/, examples/, or ~/.hive/agents/)"
|
||||
"agent_path must be inside an allowed directory "
|
||||
"($HIVE_HOME/colonies/, exports/, examples/, or $HIVE_HOME/agents/)"
|
||||
)
|
||||
|
||||
|
||||
@@ -88,13 +99,15 @@ def resolve_session(request: web.Request):
|
||||
def sessions_dir(session: Session) -> Path:
|
||||
"""Resolve the worker sessions directory for a session.
|
||||
|
||||
Storage layout: ~/.hive/agents/{agent_name}/sessions/
|
||||
Storage layout: $HIVE_HOME/agents/{agent_name}/sessions/
|
||||
Requires a worker to be loaded (worker_path must be set).
|
||||
"""
|
||||
if session.worker_path is None:
|
||||
raise ValueError("No worker loaded — no worker sessions directory")
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
agent_name = session.worker_path.name
|
||||
return Path.home() / ".hive" / "agents" / agent_name / "sessions"
|
||||
return HIVE_HOME / "agents" / agent_name / "sessions"
|
||||
|
||||
|
||||
# Allowed CORS origins (localhost on any port)
|
||||
@@ -134,6 +147,47 @@ async def cors_middleware(request: web.Request, handler):
|
||||
return response
|
||||
|
||||
|
||||
@web.middleware
|
||||
async def no_cache_api_middleware(request: web.Request, handler):
|
||||
"""Prevent browsers from caching API responses.
|
||||
|
||||
Without this, a one-off bad response (e.g. the SPA catch-all leaking
|
||||
index.html for an /api/* URL before a route was registered) can get
|
||||
pinned in the browser's disk cache and replayed forever, since our
|
||||
JSON handlers don't emit ETag/Last-Modified and browsers fall back
|
||||
to heuristic freshness.
|
||||
"""
|
||||
try:
|
||||
response = await handler(request)
|
||||
except web.HTTPException as exc:
|
||||
response = exc
|
||||
if request.path.startswith("/api/"):
|
||||
response.headers["Cache-Control"] = "no-store"
|
||||
return response
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Desktop shared-secret auth middleware.
|
||||
#
|
||||
# When the runtime is spawned by the Electron main process, a fresh random
|
||||
# token is passed via ``HIVE_DESKTOP_TOKEN``. Every request from main must
|
||||
# carry the matching ``X-Hive-Token`` header. If the env var is unset (e.g.
|
||||
# running ``hive serve`` directly from a terminal), the check is skipped —
|
||||
# OSS behaviour is preserved.
|
||||
# ---------------------------------------------------------------------------
|
||||
_EXPECTED_DESKTOP_TOKEN: str | None = os.environ.get("HIVE_DESKTOP_TOKEN") or None
|
||||
|
||||
|
||||
@web.middleware
|
||||
async def desktop_auth_middleware(request: web.Request, handler):
|
||||
if _EXPECTED_DESKTOP_TOKEN is None:
|
||||
return await handler(request)
|
||||
provided = request.headers.get("X-Hive-Token", "")
|
||||
if not hmac.compare_digest(provided, _EXPECTED_DESKTOP_TOKEN):
|
||||
return web.json_response({"error": "unauthorized"}, status=401)
|
||||
return await handler(request)
|
||||
|
||||
|
||||
@web.middleware
|
||||
async def error_middleware(request: web.Request, handler):
|
||||
"""Catch exceptions and return JSON error responses.
|
||||
@@ -262,7 +316,12 @@ def create_app(model: str | None = None) -> web.Application:
|
||||
Returns:
|
||||
Configured aiohttp Application ready to run.
|
||||
"""
|
||||
app = web.Application(middlewares=[cors_middleware, error_middleware])
|
||||
# Desktop mode: the runtime is always a subprocess of the Electron main
|
||||
# process, which reaches it via IPC and the `hive://` custom protocol.
|
||||
# There is no browser origin to authorize, so CORS is unnecessary.
|
||||
# The auth middleware enforces the shared-secret token when the env var
|
||||
# is set (i.e. when Electron spawned us); it is a no-op otherwise.
|
||||
app = web.Application(middlewares=[desktop_auth_middleware, no_cache_api_middleware, error_middleware])
|
||||
|
||||
# Initialize credential store (before SessionManager so it can be shared)
|
||||
from framework.credentials.store import CredentialStore
|
||||
@@ -275,17 +334,26 @@ def create_app(model: str | None = None) -> web.Application:
|
||||
|
||||
# Auto-generate credential key for web-only users who never ran the TUI
|
||||
if not os.environ.get("HIVE_CREDENTIAL_KEY"):
|
||||
try:
|
||||
from framework.credentials.key_storage import generate_and_save_credential_key
|
||||
if _has_encrypted_credentials():
|
||||
logger.warning(
|
||||
"HIVE_CREDENTIAL_KEY is missing but encrypted credentials already exist; "
|
||||
"not generating a replacement key because it would not decrypt existing credentials"
|
||||
)
|
||||
else:
|
||||
try:
|
||||
from framework.credentials.key_storage import generate_and_save_credential_key
|
||||
|
||||
generate_and_save_credential_key()
|
||||
logger.info("Generated and persisted HIVE_CREDENTIAL_KEY to ~/.hive/secrets/credential_key")
|
||||
except Exception as exc:
|
||||
logger.warning("Could not auto-persist HIVE_CREDENTIAL_KEY: %s", exc)
|
||||
generate_and_save_credential_key()
|
||||
logger.info("Generated and persisted HIVE_CREDENTIAL_KEY to ~/.hive/secrets/credential_key")
|
||||
except Exception as exc:
|
||||
logger.warning("Could not auto-persist HIVE_CREDENTIAL_KEY: %s", exc)
|
||||
|
||||
# Local server startup should not wait on an eager Aden sync.
|
||||
# The store can still fetch/refresh credentials on demand.
|
||||
credential_store = CredentialStore.with_aden_sync(auto_sync=False)
|
||||
if not os.environ.get("HIVE_CREDENTIAL_KEY") and _has_encrypted_credentials():
|
||||
credential_store = CredentialStore.with_env_storage()
|
||||
else:
|
||||
credential_store = CredentialStore.with_aden_sync(auto_sync=False)
|
||||
except Exception:
|
||||
logger.debug("Encrypted credential store unavailable, using in-memory fallback")
|
||||
credential_store = CredentialStore.for_testing({})
|
||||
@@ -301,6 +369,18 @@ def create_app(model: str | None = None) -> web.Application:
|
||||
queen_tool_registry=None,
|
||||
)
|
||||
|
||||
# Clear orphaned compaction markers from prior server crashes. Without
|
||||
# this, any session whose compaction was interrupted would block the
|
||||
# next colony cold-load for the full await_completion timeout (180s)
|
||||
# before falling through. See compaction_status.sweep_stale_in_progress.
|
||||
try:
|
||||
from framework.config import QUEENS_DIR
|
||||
from framework.server import compaction_status
|
||||
|
||||
compaction_status.sweep_stale_in_progress(QUEENS_DIR)
|
||||
except Exception:
|
||||
logger.debug("compaction_status: startup sweep skipped", exc_info=True)
|
||||
|
||||
# Register shutdown hook
|
||||
app.on_shutdown.append(_on_shutdown)
|
||||
|
||||
@@ -310,16 +390,22 @@ def create_app(model: str | None = None) -> web.Application:
|
||||
app.router.add_get("/api/browser/status/stream", handle_browser_status_stream)
|
||||
|
||||
# Register route modules
|
||||
from framework.server.routes_colonies import register_routes as register_colonies_routes
|
||||
from framework.server.routes_colony_tools import register_routes as register_colony_tools_routes
|
||||
from framework.server.routes_colony_workers import register_routes as register_colony_worker_routes
|
||||
from framework.server.routes_config import register_routes as register_config_routes
|
||||
from framework.server.routes_credentials import register_routes as register_credential_routes
|
||||
from framework.server.routes_events import register_routes as register_event_routes
|
||||
from framework.server.routes_execution import register_routes as register_execution_routes
|
||||
from framework.server.routes_logs import register_routes as register_log_routes
|
||||
from framework.server.routes_mcp import register_routes as register_mcp_routes
|
||||
from framework.server.routes_messages import register_routes as register_message_routes
|
||||
from framework.server.routes_prompts import register_routes as register_prompt_routes
|
||||
from framework.server.routes_queen_tools import register_routes as register_queen_tools_routes
|
||||
from framework.server.routes_queens import register_routes as register_queen_routes
|
||||
from framework.server.routes_sessions import register_routes as register_session_routes
|
||||
from framework.server.routes_skills import register_routes as register_skills_routes
|
||||
from framework.server.routes_tasks import register_routes as register_task_routes
|
||||
from framework.server.routes_workers import register_routes as register_worker_routes
|
||||
|
||||
register_config_routes(app)
|
||||
@@ -331,11 +417,32 @@ def create_app(model: str | None = None) -> web.Application:
|
||||
register_worker_routes(app)
|
||||
register_log_routes(app)
|
||||
register_queen_routes(app)
|
||||
register_queen_tools_routes(app)
|
||||
register_colonies_routes(app)
|
||||
register_colony_tools_routes(app)
|
||||
register_mcp_routes(app)
|
||||
register_colony_worker_routes(app)
|
||||
register_prompt_routes(app)
|
||||
register_skills_routes(app)
|
||||
register_task_routes(app)
|
||||
|
||||
# Static file serving — Option C production mode
|
||||
# If frontend/dist/ exists, serve built frontend files on /
|
||||
# Commercial extensions (optional — only present in hive-desktop-runtime).
|
||||
# Imports lazily so an OSS install without the `commercial` package keeps
|
||||
# working unchanged.
|
||||
try:
|
||||
from commercial.middleware import setup_commercial_middleware
|
||||
from commercial.routes import register_routes as register_commercial_routes
|
||||
|
||||
setup_commercial_middleware(app)
|
||||
register_commercial_routes(app)
|
||||
logger.info("Commercial extensions loaded")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Serve the built frontend SPA (if frontend/dist exists) so hitting the
|
||||
# API host in a browser loads the dashboard instead of 404'ing. In
|
||||
# Electron/desktop mode the renderer still loads from file:// and
|
||||
# ignores this; in dev mode Vite is used instead.
|
||||
_setup_static_serving(app)
|
||||
|
||||
return app
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
"""Track fork-compaction status for freshly-forked colony queen sessions.
|
||||
|
||||
When ``create_colony`` forks a queen session into a colony, the
|
||||
inherited DM transcript is compacted via an LLM call that can legitimately
|
||||
exceed the default tool-call timeout (60s). To keep ``create_colony``
|
||||
responsive we run that compaction in the background and record its
|
||||
status on disk so a subsequent colony session-load can wait for it to
|
||||
settle before reading the conversation files.
|
||||
|
||||
The status lives at ``<queen_dir>/compaction_status.json``:
|
||||
|
||||
{"status": "in_progress", "started_at": "..."}
|
||||
{"status": "done", "completed_at": "...", "messages_compacted": N, "summary_chars": M}
|
||||
{"status": "failed", "completed_at": "...", "error": "..."}
|
||||
|
||||
Only present when a compaction was scheduled for this queen dir — absent
|
||||
otherwise. All writes are fail-soft; a missing/corrupt file is treated
|
||||
as "no compaction pending".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_STATUS_FILENAME = "compaction_status.json"
|
||||
|
||||
|
||||
def _status_path(queen_dir: Path) -> Path:
|
||||
return Path(queen_dir) / _STATUS_FILENAME
|
||||
|
||||
|
||||
def mark_in_progress(queen_dir: Path) -> None:
|
||||
path = _status_path(queen_dir)
|
||||
try:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"status": "in_progress",
|
||||
"started_at": datetime.now(UTC).isoformat(),
|
||||
},
|
||||
ensure_ascii=False,
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
except OSError:
|
||||
logger.warning(
|
||||
"compaction_status: failed to write 'in_progress' at %s",
|
||||
path,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
def mark_done(
|
||||
queen_dir: Path,
|
||||
*,
|
||||
messages_compacted: int = 0,
|
||||
summary_chars: int = 0,
|
||||
) -> None:
|
||||
path = _status_path(queen_dir)
|
||||
try:
|
||||
path.write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"status": "done",
|
||||
"completed_at": datetime.now(UTC).isoformat(),
|
||||
"messages_compacted": messages_compacted,
|
||||
"summary_chars": summary_chars,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
except OSError:
|
||||
logger.warning(
|
||||
"compaction_status: failed to write 'done' at %s",
|
||||
path,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
def mark_failed(queen_dir: Path, error: str) -> None:
|
||||
path = _status_path(queen_dir)
|
||||
try:
|
||||
path.write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"status": "failed",
|
||||
"completed_at": datetime.now(UTC).isoformat(),
|
||||
"error": (error or "")[:500],
|
||||
},
|
||||
ensure_ascii=False,
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
except OSError:
|
||||
logger.warning(
|
||||
"compaction_status: failed to write 'failed' at %s",
|
||||
path,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
def get_status(queen_dir: Path) -> dict | None:
|
||||
path = _status_path(queen_dir)
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return None
|
||||
|
||||
|
||||
async def await_completion(
|
||||
queen_dir: Path,
|
||||
*,
|
||||
timeout: float = 180.0,
|
||||
poll: float = 0.5,
|
||||
) -> dict | None:
|
||||
"""Block until compaction leaves 'in_progress' state.
|
||||
|
||||
Returns the final status dict, or ``None`` if no compaction marker
|
||||
exists for this dir. On timeout returns the last observed status
|
||||
(still 'in_progress') so the caller can decide whether to proceed
|
||||
with the raw transcript.
|
||||
"""
|
||||
loop = asyncio.get_event_loop()
|
||||
deadline = loop.time() + max(0.0, timeout)
|
||||
last: dict | None = None
|
||||
while True:
|
||||
last = get_status(queen_dir)
|
||||
if last is None:
|
||||
return None
|
||||
if last.get("status") != "in_progress":
|
||||
return last
|
||||
if loop.time() >= deadline:
|
||||
logger.warning(
|
||||
"compaction_status: timed out after %.0fs waiting for %s (proceeding with raw transcript)",
|
||||
timeout,
|
||||
queen_dir,
|
||||
)
|
||||
return last
|
||||
await asyncio.sleep(poll)
|
||||
|
||||
|
||||
def sweep_stale_in_progress(queens_root: Path) -> int:
|
||||
"""Rewrite any orphaned ``in_progress`` markers under ``queens_root`` to
|
||||
``failed``. Returns the count of rewritten markers.
|
||||
|
||||
Whatever process owned the original compaction is gone (server crash,
|
||||
SIGKILL, etc.), so leaving the marker at ``in_progress`` would cause every
|
||||
subsequent colony cold-load for that queen session to wait the full
|
||||
``await_completion`` timeout (default 180s) before falling through.
|
||||
|
||||
Called once during server bootstrap. Best-effort: any per-file failure is
|
||||
logged and skipped — the sweep should never prevent the server from
|
||||
coming up.
|
||||
"""
|
||||
if not queens_root.exists():
|
||||
return 0
|
||||
cleaned = 0
|
||||
try:
|
||||
for queen_dir in queens_root.iterdir():
|
||||
if not queen_dir.is_dir():
|
||||
continue
|
||||
sessions_dir = queen_dir / "sessions"
|
||||
if not sessions_dir.exists():
|
||||
continue
|
||||
try:
|
||||
for session_dir in sessions_dir.iterdir():
|
||||
if not session_dir.is_dir():
|
||||
continue
|
||||
status = get_status(session_dir)
|
||||
if status is None or status.get("status") != "in_progress":
|
||||
continue
|
||||
mark_failed(session_dir, "server restarted while compaction was in progress")
|
||||
cleaned += 1
|
||||
except OSError:
|
||||
logger.debug(
|
||||
"compaction_status: sweep failed under %s",
|
||||
sessions_dir,
|
||||
exc_info=True,
|
||||
)
|
||||
except OSError:
|
||||
logger.debug(
|
||||
"compaction_status: sweep failed under %s",
|
||||
queens_root,
|
||||
exc_info=True,
|
||||
)
|
||||
if cleaned:
|
||||
logger.info(
|
||||
"compaction_status: cleared %d stale 'in_progress' marker(s) at startup",
|
||||
cleaned,
|
||||
)
|
||||
return cleaned
|
||||
@@ -113,10 +113,18 @@ def install_worker_escalation_routing(
|
||||
queen_node = session.queen_executor.node_registry.get("queen") if session.queen_executor is not None else None
|
||||
if queen_node is None or not hasattr(queen_node, "inject_event"):
|
||||
if session.event_bus is not None:
|
||||
# Stream the handoff text so the human sees the worker's
|
||||
# question, then request input so the reply input appears.
|
||||
await session.event_bus.emit_client_output_delta(
|
||||
stream_id="queen",
|
||||
node_id="queen",
|
||||
content=handoff,
|
||||
snapshot=handoff,
|
||||
execution_id=session.id,
|
||||
)
|
||||
await session.event_bus.emit_client_input_requested(
|
||||
stream_id="queen",
|
||||
node_id="queen",
|
||||
prompt=handoff,
|
||||
execution_id=session.id,
|
||||
)
|
||||
return
|
||||
@@ -245,6 +253,92 @@ async def materialize_queen_identity(
|
||||
)
|
||||
|
||||
|
||||
def build_queen_tool_registry_bare() -> tuple[Any, dict[str, list[dict[str, Any]]]]:
|
||||
"""Build a Queen ``ToolRegistry`` and a (server_name → tools) catalog.
|
||||
|
||||
Used by the Tool Library GET route to populate the MCP tool surface
|
||||
without needing a live queen session. We DO NOT register queen
|
||||
lifecycle tools here (they require a Session stub); the catalog only
|
||||
covers MCP-origin tools, which is what the allowlist gates.
|
||||
|
||||
Loading MCP servers spawns subprocesses, so call this once per
|
||||
backend process and cache the result.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
import framework.agents.queen as _queen_pkg
|
||||
from framework.loader.mcp_registry import MCPRegistry
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
|
||||
queen_registry = ToolRegistry()
|
||||
queen_pkg_dir = Path(_queen_pkg.__file__).parent
|
||||
|
||||
mcp_config = queen_pkg_dir / "mcp_servers.json"
|
||||
if mcp_config.exists():
|
||||
try:
|
||||
queen_registry.load_mcp_config(mcp_config)
|
||||
except Exception:
|
||||
logger.warning("build_queen_tool_registry_bare: MCP config failed", exc_info=True)
|
||||
|
||||
try:
|
||||
reg = MCPRegistry()
|
||||
reg.initialize()
|
||||
if (queen_pkg_dir / "mcp_registry.json").is_file():
|
||||
queen_registry.set_mcp_registry_agent_path(queen_pkg_dir)
|
||||
registry_configs, selection_max_tools = reg.load_agent_selection(queen_pkg_dir)
|
||||
|
||||
already = {cfg.get("name") for cfg in registry_configs if cfg.get("name")}
|
||||
extra: list[str] = []
|
||||
try:
|
||||
for entry in reg.list_installed():
|
||||
if entry.get("source") != "local":
|
||||
continue
|
||||
if not entry.get("enabled", True):
|
||||
continue
|
||||
name = entry.get("name")
|
||||
if name and name not in already:
|
||||
extra.append(name)
|
||||
except Exception:
|
||||
pass
|
||||
if extra:
|
||||
try:
|
||||
extra_configs = reg.resolve_for_agent(include=extra)
|
||||
registry_configs = list(registry_configs) + [reg._server_config_to_dict(c) for c in extra_configs]
|
||||
except Exception:
|
||||
logger.debug("build_queen_tool_registry_bare: resolve_for_agent(extra) failed", exc_info=True)
|
||||
|
||||
if registry_configs:
|
||||
queen_registry.load_registry_servers(
|
||||
registry_configs,
|
||||
preserve_existing_tools=True,
|
||||
log_collisions=False,
|
||||
max_tools=selection_max_tools,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("build_queen_tool_registry_bare: MCP registry load failed", exc_info=True)
|
||||
|
||||
# Build the catalog.
|
||||
tools_by_name = queen_registry.get_tools()
|
||||
server_map = dict(getattr(queen_registry, "_mcp_server_tools", {}) or {})
|
||||
catalog: dict[str, list[dict[str, Any]]] = {}
|
||||
for server_name in sorted(server_map):
|
||||
entries: list[dict[str, Any]] = []
|
||||
for tool_name in sorted(server_map[server_name]):
|
||||
tool = tools_by_name.get(tool_name)
|
||||
if tool is None:
|
||||
continue
|
||||
entries.append(
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"input_schema": tool.parameters,
|
||||
}
|
||||
)
|
||||
catalog[server_name] = entries
|
||||
|
||||
return queen_registry, catalog
|
||||
|
||||
|
||||
async def create_queen(
|
||||
session: Session,
|
||||
session_manager: Any,
|
||||
@@ -266,21 +360,24 @@ async def create_queen(
|
||||
queen_loop_config as _base_loop_config,
|
||||
)
|
||||
from framework.agents.queen.nodes import (
|
||||
_QUEEN_INCUBATING_TOOLS,
|
||||
_QUEEN_INDEPENDENT_TOOLS,
|
||||
_QUEEN_REVIEWING_TOOLS,
|
||||
_QUEEN_WORKING_TOOLS,
|
||||
_queen_behavior_always,
|
||||
_queen_behavior_independent,
|
||||
_queen_character_core,
|
||||
_queen_role_incubating,
|
||||
_queen_role_independent,
|
||||
_queen_role_reviewing,
|
||||
_queen_role_working,
|
||||
_queen_style,
|
||||
_queen_tools_incubating,
|
||||
_queen_tools_independent,
|
||||
_queen_tools_reviewing,
|
||||
_queen_tools_working,
|
||||
finalize_queen_prompt,
|
||||
)
|
||||
from framework.config import get_max_tokens as _get_max_tokens
|
||||
from framework.host.event_bus import AgentEvent, EventType
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.loader.mcp_registry import MCPRegistry
|
||||
@@ -315,6 +412,45 @@ async def create_queen(
|
||||
if (queen_pkg_dir / "mcp_registry.json").is_file():
|
||||
queen_registry.set_mcp_registry_agent_path(queen_pkg_dir)
|
||||
registry_configs, selection_max_tools = registry.load_agent_selection(queen_pkg_dir)
|
||||
|
||||
# Auto-include every user-added local MCP server that the repo
|
||||
# selection hasn't already loaded. Users register servers via
|
||||
# the `/api/mcp/servers` route (or `hive mcp add`); they live in
|
||||
# ~/.hive/mcp_registry/installed.json with source == "local".
|
||||
# New servers take effect on the next queen session start; the
|
||||
# prompt cache and ToolRegistry are still loaded once per boot.
|
||||
already_loaded_names = {cfg.get("name") for cfg in registry_configs if cfg.get("name")}
|
||||
extra_names: list[str] = []
|
||||
try:
|
||||
for entry in registry.list_installed():
|
||||
if entry.get("source") != "local":
|
||||
continue
|
||||
if not entry.get("enabled", True):
|
||||
continue
|
||||
name = entry.get("name")
|
||||
if not name or name in already_loaded_names:
|
||||
continue
|
||||
extra_names.append(name)
|
||||
except Exception:
|
||||
logger.debug("Queen: list_installed() failed while auto-including user servers", exc_info=True)
|
||||
|
||||
if extra_names:
|
||||
try:
|
||||
extra_configs = registry.resolve_for_agent(include=extra_names)
|
||||
extra_dicts = [registry._server_config_to_dict(c) for c in extra_configs]
|
||||
registry_configs = list(registry_configs) + extra_dicts
|
||||
logger.info(
|
||||
"Queen: auto-including %d user-added MCP server(s): %s",
|
||||
len(extra_dicts),
|
||||
[c.get("name") for c in extra_dicts],
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Queen: failed to resolve user-added MCP servers %s",
|
||||
extra_names,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if registry_configs:
|
||||
results = queen_registry.load_registry_servers(
|
||||
registry_configs,
|
||||
@@ -352,6 +488,21 @@ async def create_queen(
|
||||
phase_state=phase_state,
|
||||
)
|
||||
|
||||
# ---- Task system tools --------------------------------------------
|
||||
# Every queen gets the four session task tools. Queens-of-colony
|
||||
# additionally get the colony_template_* tools (gated by colony_id).
|
||||
from framework.tasks.tools import (
|
||||
register_colony_template_tools,
|
||||
register_task_tools,
|
||||
)
|
||||
|
||||
register_task_tools(queen_registry)
|
||||
_colony_id_for_queen = getattr(session, "colony_id", None) or getattr(
|
||||
getattr(session, "colony_runtime", None), "_colony_id", None
|
||||
)
|
||||
if _colony_id_for_queen:
|
||||
register_colony_template_tools(queen_registry, colony_id=_colony_id_for_queen)
|
||||
|
||||
# ---- Colony runtime check (only when worker is loaded) ----------------
|
||||
if session.colony_runtime:
|
||||
from framework.tools.worker_monitoring_tools import register_worker_monitoring_tools
|
||||
@@ -378,6 +529,7 @@ async def create_queen(
|
||||
|
||||
# ---- Partition tools by phase ------------------------------------
|
||||
independent_names = set(_QUEEN_INDEPENDENT_TOOLS)
|
||||
incubating_names = set(_QUEEN_INCUBATING_TOOLS)
|
||||
working_names = set(_QUEEN_WORKING_TOOLS)
|
||||
reviewing_names = set(_QUEEN_REVIEWING_TOOLS)
|
||||
|
||||
@@ -386,16 +538,89 @@ async def create_queen(
|
||||
|
||||
phase_state.working_tools = [t for t in queen_tools if t.name in working_names]
|
||||
phase_state.reviewing_tools = [t for t in queen_tools if t.name in reviewing_names]
|
||||
# Incubating tool surface is intentionally minimal (read-only inspection
|
||||
# + create_colony + cancel_incubation) — no MCP tools spliced in, so the
|
||||
# queen stays focused on drafting the spec.
|
||||
phase_state.incubating_tools = [t for t in queen_tools if t.name in incubating_names]
|
||||
|
||||
# Independent phase gets core tools + all MCP tools not claimed by any
|
||||
# other phase (coder-tools file I/O, gcu-tools browser, etc.).
|
||||
all_phase_names = independent_names | working_names | reviewing_names
|
||||
# other phase (files-tools file I/O, gcu-tools browser, etc.).
|
||||
all_phase_names = independent_names | incubating_names | working_names | reviewing_names
|
||||
mcp_tools = [t for t in queen_tools if t.name not in all_phase_names]
|
||||
phase_state.independent_tools = [t for t in queen_tools if t.name in independent_names] + mcp_tools
|
||||
logger.info(
|
||||
"Queen: independent tools: %s",
|
||||
sorted(t.name for t in phase_state.independent_tools),
|
||||
)
|
||||
logger.info(
|
||||
"Queen: incubating tools: %s",
|
||||
sorted(t.name for t in phase_state.incubating_tools),
|
||||
)
|
||||
|
||||
# ---- Per-queen MCP tool allowlist --------------------------------
|
||||
# Capture the set of MCP-origin tool names so the allowlist in
|
||||
# ``QueenPhaseState`` only gates MCP tools (lifecycle and synthetic
|
||||
# tools always pass through). Then apply the queen profile's stored
|
||||
# allowlist (if any) and memoize the filtered independent tool list.
|
||||
mcp_server_tools_map: dict[str, set[str]] = dict(getattr(queen_registry, "_mcp_server_tools", {}))
|
||||
phase_state.mcp_tool_names_all = set().union(*mcp_server_tools_map.values()) if mcp_server_tools_map else set()
|
||||
# The queen's MCP tool allowlist now lives in a dedicated
|
||||
# ``tools.json`` sidecar next to ``profile.yaml``. ``load_queen_tools_config``
|
||||
# migrates any legacy ``enabled_mcp_tools`` field out of profile.yaml
|
||||
# on first read, so existing installs upgrade silently.
|
||||
from framework.agents.queen.queen_tools_config import load_queen_tools_config
|
||||
|
||||
# Build a minimal catalog for default-tool resolution. The full
|
||||
# ``session_manager._mcp_tool_catalog`` snapshot is written further
|
||||
# down the flow; a queen booted for the first time needs the catalog
|
||||
# now so ``@server:NAME`` shorthands in the role-default table can
|
||||
# expand against the just-loaded MCP servers.
|
||||
_boot_catalog: dict[str, list[dict]] = {
|
||||
srv: [{"name": name} for name in sorted(names)] for srv, names in mcp_server_tools_map.items()
|
||||
}
|
||||
# ``queen_dir`` is ``queens/<queen_id>/sessions/<session_id>``; the
|
||||
# allowlist sidecar is keyed by queen_id, not session_id.
|
||||
phase_state.enabled_mcp_tools = load_queen_tools_config(session.queen_name, _boot_catalog)
|
||||
phase_state.rebuild_independent_filter()
|
||||
if phase_state.enabled_mcp_tools is not None:
|
||||
total_mcp = len(phase_state.mcp_tool_names_all)
|
||||
allowed_mcp = len(set(phase_state.enabled_mcp_tools) & phase_state.mcp_tool_names_all)
|
||||
logger.info(
|
||||
"Queen: per-queen MCP allowlist active — %d of %d MCP tools enabled",
|
||||
allowed_mcp,
|
||||
total_mcp,
|
||||
)
|
||||
|
||||
# ---- MCP tool catalog for the frontend ---------------------------
|
||||
# Snapshot per-server tool metadata so the Queen Tools API can render
|
||||
# the tool surface without spawning MCP subprocesses. Keyed by server
|
||||
# name so the UI can group tools by origin. Updated every time a
|
||||
# queen boots, so installing a new server and starting a new queen
|
||||
# session refreshes the catalog.
|
||||
mcp_tool_catalog: dict[str, list[dict[str, Any]]] = {}
|
||||
tools_by_name = {t.name: t for t in queen_tools}
|
||||
for server_name, tool_names in mcp_server_tools_map.items():
|
||||
server_entries: list[dict[str, Any]] = []
|
||||
for tool_name in sorted(tool_names):
|
||||
tool = tools_by_name.get(tool_name)
|
||||
if tool is None:
|
||||
continue
|
||||
server_entries.append(
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"input_schema": tool.parameters,
|
||||
}
|
||||
)
|
||||
mcp_tool_catalog[server_name] = server_entries
|
||||
# All queens share one MCP registry, so the catalog is a manager-level
|
||||
# fact; stash it on the SessionManager so the Queen Tools route can
|
||||
# render the tool list even when no queen session is currently live.
|
||||
if session_manager is not None:
|
||||
try:
|
||||
session_manager._mcp_tool_catalog = mcp_tool_catalog # type: ignore[attr-defined]
|
||||
except Exception:
|
||||
logger.debug("Queen: could not attach mcp_tool_catalog to manager", exc_info=True)
|
||||
|
||||
# ---- Global + queen-scoped memory ----------------------------------
|
||||
global_dir, queen_mem_dir = initialize_memory_scopes(session, phase_state)
|
||||
@@ -421,37 +646,56 @@ async def create_queen(
|
||||
(
|
||||
_queen_character_core
|
||||
+ _queen_role_independent
|
||||
+ _queen_style
|
||||
+ _queen_tools_independent
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_independent
|
||||
),
|
||||
_has_vision,
|
||||
)
|
||||
phase_state.prompt_incubating = finalize_queen_prompt(
|
||||
(_queen_character_core + _queen_role_incubating + _queen_tools_incubating + _queen_behavior_always),
|
||||
_has_vision,
|
||||
)
|
||||
phase_state.prompt_working = finalize_queen_prompt(
|
||||
(_queen_character_core + _queen_role_working + _queen_style + _queen_tools_working + _queen_behavior_always),
|
||||
(_queen_character_core + _queen_role_working + _queen_tools_working + _queen_behavior_always),
|
||||
_has_vision,
|
||||
)
|
||||
phase_state.prompt_reviewing = finalize_queen_prompt(
|
||||
(
|
||||
_queen_character_core
|
||||
+ _queen_role_reviewing
|
||||
+ _queen_style
|
||||
+ _queen_tools_reviewing
|
||||
+ _queen_behavior_always
|
||||
),
|
||||
(_queen_character_core + _queen_role_reviewing + _queen_tools_reviewing + _queen_behavior_always),
|
||||
_has_vision,
|
||||
)
|
||||
|
||||
# ---- Default skill protocols -------------------------------------
|
||||
_queen_skill_dirs: list[str] = []
|
||||
try:
|
||||
from framework.config import QUEENS_DIR
|
||||
from framework.skills.discovery import ExtraScope
|
||||
from framework.skills.manager import SkillsManager, SkillsManagerConfig
|
||||
|
||||
# Pass project_root so user-scope skills (~/.hive/skills/, ~/.agents/skills/)
|
||||
# are discovered. Queen has no agent-specific project root, so we use its
|
||||
# own directory — the value just needs to be non-None to enable user-scope scanning.
|
||||
_queen_skills_mgr = SkillsManager(SkillsManagerConfig(project_root=Path(__file__).parent))
|
||||
# Queen home backs the queen-UI skill scope and the queen's
|
||||
# override store. The directory already exists (or is created on
|
||||
# demand by queen_profiles.py); treat a missing queen_name as the
|
||||
# default queen to preserve backwards compatibility.
|
||||
_queen_id = getattr(session, "queen_name", None) or "default"
|
||||
_queen_home = QUEENS_DIR / _queen_id
|
||||
_queen_skills_mgr = SkillsManager(
|
||||
SkillsManagerConfig(
|
||||
queen_id=_queen_id,
|
||||
queen_overrides_path=_queen_home / "skills_overrides.json",
|
||||
extra_scope_dirs=[
|
||||
ExtraScope(
|
||||
directory=_queen_home / "skills",
|
||||
label="queen_ui",
|
||||
priority=2,
|
||||
)
|
||||
],
|
||||
# No project_root — queen's project is her own identity;
|
||||
# user-scope discovery still runs without one.
|
||||
project_root=None,
|
||||
skip_community_discovery=True,
|
||||
interactive=False,
|
||||
)
|
||||
)
|
||||
_queen_skills_mgr.load()
|
||||
phase_state.protocols_prompt = _queen_skills_mgr.protocols_prompt
|
||||
phase_state.skills_catalog_prompt = _queen_skills_mgr.skills_catalog_prompt
|
||||
@@ -490,8 +734,37 @@ async def create_queen(
|
||||
|
||||
# ---- Recall on each real user turn --------------------------------
|
||||
async def _recall_on_user_input(event: AgentEvent) -> None:
|
||||
"""Re-select memories when real user input arrives."""
|
||||
await _refresh_recall_cache((event.data or {}).get("content", ""))
|
||||
"""On real user input, freeze the dynamic system-prompt suffix and
|
||||
refresh recall memories in the background.
|
||||
|
||||
The EventBus drops handlers that exceed 15s, so we MUST return fast.
|
||||
Recall selection queries the LLM and can take >15s on slow backends;
|
||||
we fire it off as a background task and re-stamp the suffix when it
|
||||
completes. The immediate refresh_dynamic_suffix call stamps a fresh
|
||||
timestamp using the last-known recall blocks so every iteration of
|
||||
THIS user turn sees a byte-stable prompt (prompt cache hits on the
|
||||
static block). Phase-change injections and worker-report injections
|
||||
go through agent_loop.inject_event() and do NOT publish
|
||||
CLIENT_INPUT_RECEIVED, so this runs exactly once per real user turn.
|
||||
"""
|
||||
query = (event.data or {}).get("content", "")
|
||||
# Immediate: stamp "now" into the frozen suffix, using whatever
|
||||
# recall blocks we already cached (from the prior turn or seeding).
|
||||
phase_state.refresh_dynamic_suffix()
|
||||
|
||||
async def _bg_refresh() -> None:
|
||||
try:
|
||||
await _refresh_recall_cache(query)
|
||||
# Re-stamp with the fresh recall blocks. Any iteration that
|
||||
# read the suffix before this point used the older recall
|
||||
# — acceptable; recall was already eventual-consistency.
|
||||
phase_state.refresh_dynamic_suffix()
|
||||
except Exception:
|
||||
logger.debug("background recall refresh failed", exc_info=True)
|
||||
|
||||
import asyncio as _asyncio
|
||||
|
||||
_asyncio.create_task(_bg_refresh())
|
||||
|
||||
session.event_bus.subscribe(
|
||||
[EventType.CLIENT_INPUT_RECEIVED],
|
||||
@@ -601,6 +874,9 @@ async def create_queen(
|
||||
except Exception:
|
||||
logger.debug("recall: initial seeding failed", exc_info=True)
|
||||
|
||||
# Freeze the dynamic suffix once so the first real turn sends a
|
||||
# byte-stable prompt even before CLIENT_INPUT_RECEIVED fires.
|
||||
phase_state.refresh_dynamic_suffix()
|
||||
return HookResult(system_prompt=phase_state.get_current_prompt())
|
||||
|
||||
# ---- Colony preparation -------------------------------------------
|
||||
@@ -645,10 +921,21 @@ async def create_queen(
|
||||
# token stays local to this task.
|
||||
try:
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
from framework.tasks.scoping import session_task_list_id
|
||||
|
||||
ToolRegistry.set_execution_context(profile=session.id)
|
||||
queen_agent_id = getattr(session, "agent_id", None) or "queen"
|
||||
queen_list_id = session_task_list_id(queen_agent_id, session.id)
|
||||
colony_id = getattr(session, "colony_id", None) or getattr(
|
||||
getattr(session, "colony_runtime", None), "_colony_id", None
|
||||
)
|
||||
ToolRegistry.set_execution_context(
|
||||
profile=session.id,
|
||||
agent_id=queen_agent_id,
|
||||
task_list_id=queen_list_id,
|
||||
colony_id=colony_id,
|
||||
)
|
||||
except Exception:
|
||||
logger.debug("Queen: failed to set browser profile for session %s", session.id, exc_info=True)
|
||||
logger.debug("Queen: failed to set execution context for session %s", session.id, exc_info=True)
|
||||
try:
|
||||
lc = _queen_loop_config
|
||||
queen_loop_config = LoopConfig(
|
||||
@@ -696,11 +983,17 @@ async def create_queen(
|
||||
llm=session.llm,
|
||||
available_tools=queen_tools,
|
||||
goal_context=queen_goal.to_prompt_context(),
|
||||
max_tokens=lc.get("max_tokens", 8192),
|
||||
# Honor configuration.json (llm.max_tokens) instead of
|
||||
# hard-defaulting to 8192. The legacy fallback ignored both
|
||||
# the user's saved ceiling AND the model's actual output
|
||||
# capacity (e.g. glm-5.1 / kimi-k2.5 both support 32k out),
|
||||
# which silently truncated long tool-emitting turns.
|
||||
max_tokens=lc.get("max_tokens", _get_max_tokens()),
|
||||
stream_id="queen",
|
||||
execution_id=session.id,
|
||||
dynamic_tools_provider=phase_state.get_current_tools,
|
||||
dynamic_prompt_provider=phase_state.get_current_prompt,
|
||||
dynamic_prompt_provider=phase_state.get_static_prompt,
|
||||
dynamic_prompt_suffix_provider=phase_state.get_dynamic_suffix,
|
||||
iteration_metadata_provider=lambda: {"phase": phase_state.phase},
|
||||
skills_catalog_prompt=phase_state.skills_catalog_prompt,
|
||||
protocols_prompt=phase_state.protocols_prompt,
|
||||
|
||||
@@ -0,0 +1,505 @@
|
||||
"""HTTP routes for colony import/export — moving a colony spec between hosts.
|
||||
|
||||
Today, just the import side: accept a `tar.gz` and unpack it into HIVE_HOME so
|
||||
a desktop client (or any external mover) can hand a colony to a remote runtime
|
||||
to run.
|
||||
|
||||
POST /api/colonies/import -- multipart/form-data
|
||||
file required -- .tar / .tar.gz / .tar.bz2 / .tar.xz
|
||||
name optional -- override the colony name (legacy single-root
|
||||
archives only); defaults to the archive's
|
||||
single top-level directory
|
||||
replace_existing optional -- "true" to overwrite, else 409 on conflict
|
||||
|
||||
The desktop sends a *multi-root* tar so the queen sees a colony's full state
|
||||
(not just metadata + data) on resume. Recognised top-level prefixes:
|
||||
|
||||
colonies/<name>/... → HIVE_HOME/colonies/<name>/...
|
||||
agents/<name>/worker/... → HIVE_HOME/agents/<name>/worker/...
|
||||
agents/queens/<queen>/sessions/<sid>/... → HIVE_HOME/agents/queens/<queen>/sessions/<sid>/...
|
||||
|
||||
Anything outside those is rejected. For backwards compat with older clients
|
||||
that tar `<name>/...` directly (single colony dir, no `colonies/` wrapper),
|
||||
the handler falls back to the legacy single-root flow when no recognised
|
||||
multi-root prefix is found.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
|
||||
from aiohttp import web
|
||||
|
||||
from framework.config import COLONIES_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Matches the convention used elsewhere in the codebase (see
|
||||
# routes_colony_workers and queen_lifecycle_tools): lowercase alphanumerics
|
||||
# and underscores only. No dots, no slashes — names are filesystem segments.
|
||||
_COLONY_NAME_RE = re.compile(r"^[a-z0-9_]+$")
|
||||
|
||||
# Conservative segment validator for the queen's session id (date-stamped UUID
|
||||
# tail like ``session_20260415_175106_eca07a69``) and queen name slug
|
||||
# (``queen_technology``). Same charset as colony names — the codebase already
|
||||
# normalises both to ``[a-z0-9_]+`` everywhere they're created, so accepting
|
||||
# a wider charset here would just introduce a foothold for path mischief.
|
||||
_SESSION_SEGMENT_RE = re.compile(r"^[a-z0-9_]+$")
|
||||
|
||||
# 100 MB cap on upload size. The multi-root tar carries worker conversations
|
||||
# (often 100s of small JSON parts) plus the queen's forked session, so the
|
||||
# legacy 50 MB ceiling is too tight. Anything bigger probably shouldn't be
|
||||
# pushed wholesale anyway.
|
||||
_MAX_UPLOAD_BYTES = 100 * 1024 * 1024
|
||||
|
||||
|
||||
def _agents_dir() -> Path:
    """Locate ``HIVE_HOME/agents``, the sibling of the colonies dir.

    Recomputed on every call (never cached at import time) so tests that
    monkeypatch ``COLONIES_DIR`` take effect without a second patch.
    """
    hive_home = Path(COLONIES_DIR).parent
    return hive_home / "agents"
|
||||
|
||||
|
||||
def _validate_colony_name(name: str) -> str | None:
    """Check that *name* is usable as a colony directory segment.

    Returns a human-readable error string on failure, or None when the
    name is acceptable (non-empty, ≤64 chars, matching ``[a-z0-9_]+``).
    """
    checks = (
        (not name, "colony name is required"),
        (len(name) > 64, "colony name too long (max 64 chars)"),
        (not _COLONY_NAME_RE.match(name), "colony name must match [a-z0-9_]+"),
    )
    for failed, message in checks:
        if failed:
            return message
    return None
|
||||
|
||||
|
||||
def _validate_session_segment(seg: str, label: str) -> str | None:
    """Validate a single path segment before it is joined into a
    destination directory under HIVE_HOME.

    Used for the queen name slug (``queen_technology``) and the queen
    session id (``session_20260415_175106_eca07a69``).

    Args:
        seg: candidate path segment.
        label: human-readable field name used in error messages.

    Returns:
        An error message string, or None when the segment is acceptable.
    """
    if not seg:
        return f"{label} is required"
    if len(seg) > 128:
        return f"{label} too long (max 128 chars)"
    if not _SESSION_SEGMENT_RE.match(seg):
        # BUG FIX: the previous message advertised [a-zA-Z0-9_-]+ but
        # _SESSION_SEGMENT_RE only accepts [a-z0-9_]+ — report the
        # charset the validator actually enforces so API clients see
        # an accurate rejection reason.
        return f"{label} must match [a-z0-9_]+"
    return None
|
||||
|
||||
|
||||
def _archive_top_level(tf: tarfile.TarFile) -> tuple[str | None, str | None]:
|
||||
"""Find the archive's single top-level directory, if it has one.
|
||||
|
||||
Used only for the legacy single-root path. Returns ``(name, error)``.
|
||||
Allows the archive to optionally include a leading ``./`` prefix.
|
||||
"""
|
||||
tops: set[str] = set()
|
||||
for member in tf.getmembers():
|
||||
if not member.name or member.name.startswith("/"):
|
||||
return None, f"invalid member path: {member.name!r}"
|
||||
parts = Path(member.name).parts
|
||||
if not parts or parts[0] == "..":
|
||||
return None, f"invalid member path: {member.name!r}"
|
||||
first = parts[0] if parts[0] != "." else (parts[1] if len(parts) > 1 else "")
|
||||
if first:
|
||||
tops.add(first)
|
||||
if len(tops) != 1:
|
||||
return None, "archive must contain exactly one top-level directory"
|
||||
return next(iter(tops)), None
|
||||
|
||||
|
||||
def _has_multi_root_prefix(tf: tarfile.TarFile) -> bool:
|
||||
"""True iff any member name starts with a recognised multi-root prefix.
|
||||
|
||||
The legacy shape (`<name>/...`) doesn't match either prefix, so this lets
|
||||
us route old and new clients through the same endpoint.
|
||||
"""
|
||||
for member in tf.getmembers():
|
||||
name = member.name
|
||||
if name.startswith("./"):
|
||||
name = name[2:]
|
||||
if name.startswith("colonies/") or name.startswith("agents/"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _normalise_member_name(name: str) -> str:
|
||||
"""Strip a leading ``./`` if present; reject absolute or empty names."""
|
||||
if name.startswith("./"):
|
||||
name = name[2:]
|
||||
return name
|
||||
|
||||
|
||||
def _safe_extract_tar(tf: tarfile.TarFile, dest: Path, *, strip_prefix: str) -> tuple[int, str | None]:
    """Extract every member of ``tf`` whose name starts with ``strip_prefix/``
    into ``dest``, with the prefix stripped off.

    Each member's resolved path must stay under ``dest``; symlinks, hardlinks,
    and device/fifo entries are rejected. Returns ``(files_extracted, error)``;
    on error the caller is responsible for cleanup.

    Members outside ``strip_prefix`` are silently *skipped* (not an error) so
    the caller can call this multiple times on the same tar with different
    prefixes — once per recognised root.
    """
    # Resolve once so every per-member containment check compares against
    # the same canonical base path.
    base = dest.resolve()
    base.mkdir(parents=True, exist_ok=True)
    files_extracted = 0
    prefix_with_sep = f"{strip_prefix}/" if strip_prefix else ""

    for member in tf.getmembers():
        name = _normalise_member_name(member.name)
        if not name:
            continue
        if strip_prefix:
            if name == strip_prefix:
                # The top-level dir entry itself; dest already exists.
                continue
            if not name.startswith(prefix_with_sep):
                # Belongs to a different root in a multi-root tar; skip.
                continue
            rel = name[len(prefix_with_sep) :]
        else:
            rel = name
        if not rel:
            continue
        # All rejection checks run BEFORE any filesystem write for this
        # member, so a hostile entry never touches disk.
        if ".." in Path(rel).parts:
            return files_extracted, f"path traversal in member: {member.name!r}"
        if member.issym() or member.islnk():
            # Links could point outside dest or alias earlier entries.
            return (
                files_extracted,
                f"symlinks/hardlinks not supported: {member.name!r}",
            )
        if member.isdev() or member.isfifo():
            return (
                files_extracted,
                f"device/fifo not supported: {member.name!r}",
            )

        # Belt-and-braces containment: even after the ".." check, verify
        # the resolved target is still inside the base directory.
        target = (base / rel).resolve()
        try:
            target.relative_to(base)
        except ValueError:
            return files_extracted, f"member escapes destination: {member.name!r}"

        if member.isdir():
            target.mkdir(parents=True, exist_ok=True)
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        # extractfile returns None for non-regular members we haven't
        # already rejected — treat that as an error rather than skipping.
        src = tf.extractfile(member)
        if src is None:
            return files_extracted, f"unsupported member: {member.name!r}"
        with target.open("wb") as out:
            shutil.copyfileobj(src, out)
        # Parses as (member.mode & 0o755) if member.mode else 0o644 — the
        # 0o755 mask strips setuid/setgid/sticky and group/other write
        # bits from whatever the archive recorded.
        target.chmod(member.mode & 0o755 if member.mode else 0o644)
        files_extracted += 1

    return files_extracted, None
|
||||
|
||||
|
||||
def _classify_multi_root_member(name: str) -> tuple[str, str] | None:
|
||||
"""Recognise a multi-root tar member and return ``(root, top_dir)``.
|
||||
|
||||
``root`` is one of ``"colonies"``, ``"agents_worker"``, ``"agents_queen"``;
|
||||
``top_dir`` is the prefix to feed to ``_safe_extract_tar`` (the part of
|
||||
the path that should be stripped before joining with the destination
|
||||
base). Returns None for members that don't match any recognised root.
|
||||
|
||||
The caller pre-validates segments before extraction, so this is purely
|
||||
structural: which root, what the strip prefix should be.
|
||||
"""
|
||||
parts = Path(name).parts
|
||||
if not parts:
|
||||
return None
|
||||
if parts[0] == "colonies" and len(parts) >= 2:
|
||||
return ("colonies", f"colonies/{parts[1]}")
|
||||
if parts[0] == "agents" and len(parts) >= 2:
|
||||
# agents/queens/<queen>/sessions/<sid>/... vs agents/<name>/worker/...
|
||||
if parts[1] == "queens":
|
||||
if len(parts) >= 5 and parts[3] == "sessions":
|
||||
return ("agents_queen", f"agents/queens/{parts[2]}/sessions/{parts[4]}")
|
||||
return None
|
||||
# Plain agent — only the worker subtree is exported.
|
||||
if len(parts) >= 3 and parts[2] == "worker":
|
||||
return ("agents_worker", f"agents/{parts[1]}/worker")
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _plan_multi_root(
    tf: tarfile.TarFile,
) -> tuple[dict[str, dict[str, str]], str | None]:
    """Walk the tar once and group entries by root.

    Returns ``(groups, error)`` where ``groups`` is keyed by root kind
    (``"colonies"``, ``"agents_worker"``, ``"agents_queen"``) and each
    entry maps a strip prefix to its destination directory (as a string)
    under HIVE_HOME. Validates name segments so we bail before unpacking
    when something looks off. On error, ``groups`` is returned in its
    partially-filled state and must not be used for extraction.
    """
    groups: dict[str, dict[str, str]] = {
        "colonies": {},
        "agents_worker": {},
        "agents_queen": {},
    }
    seen_unrecognised: set[str] = set()
    for member in tf.getmembers():
        name = _normalise_member_name(member.name)
        # Hard-reject absolute paths and traversal anywhere in the name;
        # unlike unrecognised roots, these abort the whole import.
        if not name or name.startswith("/") or ".." in Path(name).parts:
            return groups, f"invalid member path: {member.name!r}"
        classified = _classify_multi_root_member(name)
        if classified is None:
            # Track unique top-level dirs to give a useful error if nothing
            # ended up classified.
            seen_unrecognised.add(Path(name).parts[0])
            continue
        kind, prefix = classified
        if prefix in groups[kind]:
            # Already planned this root (one entry per strip prefix).
            continue
        # Validate path segments per-kind so we never plumb dirty input into
        # a destination we don't fully control.
        prefix_parts = Path(prefix).parts
        if kind == "colonies":
            err = _validate_colony_name(prefix_parts[1])
            if err:
                return groups, err
            dest = str(COLONIES_DIR / prefix_parts[1])
        elif kind == "agents_worker":
            # NOTE(review): agent names are checked with the colony-name
            # validator — presumably both share the [a-z0-9_]+ convention;
            # confirm agent naming matches.
            err = _validate_colony_name(prefix_parts[1])
            if err:
                return groups, err
            dest = str(_agents_dir() / prefix_parts[1] / "worker")
        elif kind == "agents_queen":
            queen, sid = prefix_parts[2], prefix_parts[4]
            err = _validate_session_segment(queen, "queen name")
            if err:
                return groups, err
            err = _validate_session_segment(sid, "queen session id")
            if err:
                return groups, err
            dest = str(_agents_dir() / "queens" / queen / "sessions" / sid)
        else:  # pragma: no cover — defensive
            continue
        groups[kind][prefix] = dest

    # Nothing classified at all: report what top-level dirs we did see so
    # the client can tell a mis-shaped tar from an empty one.
    if not any(groups.values()):
        roots = ", ".join(sorted(seen_unrecognised)) or "(none)"
        return (
            groups,
            "tar has no recognised top-level prefix "
            f"(expected colonies/, agents/<name>/worker/, "
            f"agents/queens/<queen>/sessions/<sid>/; got: {roots})",
        )
    return groups, None
|
||||
|
||||
|
||||
async def _read_upload(
    request: web.Request,
) -> tuple[bytes | None, str | None, dict[str, str], web.Response | None]:
    """Drain the multipart upload. Returns ``(bytes, filename, form, error)``.

    Exactly one side of the tuple is populated: on success ``error`` is
    None; on failure the payload slots are empty and ``error`` is a
    ready-to-return aiohttp JSON response (400 or 413).
    """
    # Reject non-multipart bodies up front rather than letting
    # request.multipart() fail deeper in the handler.
    if not request.content_type.startswith("multipart/"):
        return None, None, {}, web.json_response({"error": "expected multipart/form-data"}, status=400)
    reader = await request.multipart()
    upload: bytes | None = None
    upload_filename: str | None = None
    form: dict[str, str] = {}
    while True:
        part = await reader.next()
        if part is None:
            # End of the multipart stream.
            break
        if part.name == "file":
            buf = io.BytesIO()
            while True:
                chunk = await part.read_chunk(size=65536)
                if not chunk:
                    break
                buf.write(chunk)
                # Enforce the cap while streaming so an oversized upload
                # is rejected as soon as it crosses the limit, not after
                # the whole body has been buffered.
                if buf.tell() > _MAX_UPLOAD_BYTES:
                    return (
                        None,
                        None,
                        {},
                        web.json_response(
                            {"error": f"upload exceeds {_MAX_UPLOAD_BYTES} bytes"},
                            status=413,
                        ),
                    )
            upload = buf.getvalue()
            upload_filename = part.filename or ""
        else:
            # Any non-file part is treated as a simple text form field.
            form[part.name or ""] = (await part.text()).strip()
    if upload is None:
        return None, None, {}, web.json_response({"error": "missing 'file' part"}, status=400)
    return upload, upload_filename, form, None
|
||||
|
||||
|
||||
async def handle_import_colony(request: web.Request) -> web.Response:
    """POST /api/colonies/import — unpack a colony tarball into HIVE_HOME.

    Reads the multipart upload, opens it as a tar, then dispatches to
    the multi-root importer when a recognised prefix is present and to
    the legacy single-root importer otherwise.
    """
    upload, filename, form, err_resp = await _read_upload(request)
    if err_resp is not None:
        return err_resp
    assert upload is not None  # for the type checker

    overwrite = form.get("replace_existing", "false").lower() == "true"
    override = form.get("name", "").strip() or None

    try:
        archive = tarfile.open(fileobj=io.BytesIO(upload), mode="r:*")
    except tarfile.TarError as err:
        return web.json_response({"error": f"invalid tar archive: {err}"}, status=400)

    try:
        if _has_multi_root_prefix(archive):
            return await _import_multi_root(archive, overwrite, filename, len(upload))
        return await _import_legacy_single_root(archive, override, overwrite, filename, len(upload))
    finally:
        archive.close()
|
||||
|
||||
|
||||
async def _import_legacy_single_root(
    tf: tarfile.TarFile,
    name_override: str | None,
    replace_existing: bool,
    upload_filename: str | None,
    upload_size: int,
) -> web.Response:
    """Legacy path: tar contains `<name>/...` only, route to colonies/<name>/.

    Kept verbatim from the previous handler so existing test fixtures and
    older desktop builds keep working during a partial rollout.

    Args:
        tf: already-open tar archive (caller closes it).
        name_override: optional colony name replacing the archive's top dir.
        replace_existing: overwrite an existing colony instead of 409ing.
        upload_filename: original upload name, for logging only.
        upload_size: upload byte count, for logging only.
    """
    top, top_err = _archive_top_level(tf)
    if top_err or top is None:
        return web.json_response({"error": top_err}, status=400)

    colony_name = name_override or top
    name_err = _validate_colony_name(colony_name)
    if name_err:
        return web.json_response({"error": name_err}, status=400)

    target = COLONIES_DIR / colony_name
    if target.exists():
        if not replace_existing:
            # Conflict: the caller must opt into overwriting.
            return web.json_response(
                {
                    "error": "colony already exists",
                    "name": colony_name,
                    "hint": "set replace_existing=true to overwrite",
                },
                status=409,
            )
        # Overwrite: clear the old tree before extracting the new one.
        shutil.rmtree(target)

    files_extracted, extract_err = _safe_extract_tar(tf, target, strip_prefix=top)
    if extract_err:
        # Partial extraction — remove whatever landed so we don't leave a
        # half-imported colony behind.
        shutil.rmtree(target, ignore_errors=True)
        return web.json_response({"error": extract_err}, status=400)

    logger.info(
        "Imported colony %s (legacy, %d files) from upload %s (%d bytes)",
        colony_name,
        files_extracted,
        upload_filename or "<unnamed>",
        upload_size,
    )
    return web.json_response(
        {
            "name": colony_name,
            "path": str(target),
            "files_imported": files_extracted,
            # NOTE(review): reports the flag value, not whether anything
            # was actually replaced — confirm clients only use it as an echo.
            "replaced": replace_existing,
        },
        status=201,
    )
|
||||
|
||||
|
||||
async def _import_multi_root(
    tf: tarfile.TarFile,
    replace_existing: bool,
    upload_filename: str | None,
    upload_size: int,
) -> web.Response:
    """New path: tar contains `colonies/<name>/...` plus optional agents trees.

    Each recognised root is extracted to its corresponding HIVE_HOME subtree
    using the same traversal-safe walker as the legacy path. ``replace_existing``
    governs the colonies dir conflict; the agents trees overwrite in place
    (worker conversations and queen sessions are append-mostly stores —
    overwriting a stale subset is fine, and adding the conflict gate would
    block legitimate re-pushes from a different desktop session).
    """
    plan, plan_err = _plan_multi_root(tf)
    if plan_err:
        return web.json_response({"error": plan_err}, status=400)

    # Conflict guard for the colonies root only — these are user-visible
    # entities the desktop expects to control overwrite of.
    primary_colony_name: str | None = None
    primary_colony_target: Path | None = None
    for prefix, dest in plan["colonies"].items():
        target = Path(dest)
        # If the tar carries several colony roots, the last one iterated
        # becomes the "primary" reported in the response.
        primary_colony_name = Path(prefix).parts[1]
        primary_colony_target = target
        if target.exists() and not replace_existing:
            return web.json_response(
                {
                    "error": "colony already exists",
                    "name": primary_colony_name,
                    "hint": "set replace_existing=true to overwrite",
                },
                status=409,
            )
        if target.exists() and replace_existing:
            shutil.rmtree(target)

    # The colonies/ root is required. agents/ trees are optional follow-ons.
    if not plan["colonies"]:
        return web.json_response(
            {
                "error": "tar missing required colonies/<name>/ root",
            },
            status=400,
        )

    summary: dict[str, dict[str, int | str]] = {}
    extracted_dests: list[Path] = []

    def _abort(err: str, status: int = 400) -> web.Response:
        # Roll back every destination we already touched.
        # NOTE(review): this rmtree's destinations that may have existed
        # BEFORE the import (agents trees overwrite in place), so a failed
        # import can delete prior worker/queen state — confirm intended.
        for path in extracted_dests:
            shutil.rmtree(path, ignore_errors=True)
        return web.json_response({"error": err}, status=status)

    # Extract colonies first, then the optional agents trees, reusing the
    # same tar with a different strip prefix per planned root.
    for kind in ("colonies", "agents_worker", "agents_queen"):
        for prefix, dest in plan[kind].items():
            target = Path(dest)
            files_extracted, extract_err = _safe_extract_tar(tf, target, strip_prefix=prefix)
            if extract_err:
                return _abort(extract_err)
            summary.setdefault(kind, {"files": 0})
            summary[kind]["files"] = int(summary[kind].get("files", 0)) + files_extracted
            extracted_dests.append(target)

    total_files = sum(int(v.get("files", 0)) for v in summary.values())
    logger.info(
        "Imported colony %s (%d files across %d roots) from upload %s (%d bytes)",
        primary_colony_name or "<unknown>",
        total_files,
        sum(1 for v in summary.values() if int(v.get("files", 0)) > 0),
        upload_filename or "<unnamed>",
        upload_size,
    )

    return web.json_response(
        {
            "name": primary_colony_name,
            "path": str(primary_colony_target) if primary_colony_target else None,
            "files_imported": total_files,
            "by_root": summary,
            # NOTE(review): echoes the request flag, not whether an
            # existing colony was actually removed.
            "replaced": replace_existing,
        },
        status=201,
    )
|
||||
|
||||
|
||||
def register_routes(app: web.Application) -> None:
    """Attach the colony import endpoint to the aiohttp application."""
    app.router.add_post("/api/colonies/import", handle_import_colony)
|
||||
@@ -0,0 +1,329 @@
|
||||
"""Per-colony MCP tool allowlist routes.
|
||||
|
||||
- GET /api/colony/{colony_name}/tools -- enumerate colony tool surface
|
||||
- PATCH /api/colony/{colony_name}/tools -- set or clear the allowlist
|
||||
|
||||
A colony's tool set is inherited from the queen that forked it, so the
|
||||
tool surface mirrors the queen's MCP servers. Lifecycle/synthetic tools
|
||||
are included for display only. MCP tools are grouped by origin server
|
||||
with per-tool ``enabled`` flags.
|
||||
|
||||
Semantics:
|
||||
|
||||
- ``enabled_mcp_tools: null`` → allow every MCP tool (default).
|
||||
- ``enabled_mcp_tools: []`` → allow no MCP tools (only lifecycle /
|
||||
synthetic pass through).
|
||||
- ``enabled_mcp_tools: [...]`` → only listed names pass.
|
||||
|
||||
The allowlist is persisted in a dedicated ``tools.json`` sidecar at
|
||||
``~/.hive/colonies/{colony_name}/tools.json``. Changes take effect on
|
||||
the *next* worker spawn. In-flight workers keep the tool list they
|
||||
booted with because workers have no dynamic tools provider today —
|
||||
mutating their tool set mid-turn would diverge from the list the LLM
|
||||
is already using.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from aiohttp import web
|
||||
|
||||
from framework.host.colony_metadata import colony_metadata_path
|
||||
from framework.host.colony_tools_config import (
|
||||
load_colony_tools_config,
|
||||
update_colony_tools_config,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_SYNTHETIC_NAMES = {"ask_user"}
|
||||
|
||||
|
||||
def _synthetic_entries() -> list[dict[str, Any]]:
|
||||
try:
|
||||
from framework.agent_loop.internals.synthetic_tools import build_ask_user_tool
|
||||
|
||||
tool = build_ask_user_tool()
|
||||
return [
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"editable": False,
|
||||
}
|
||||
]
|
||||
except Exception:
|
||||
return [
|
||||
{
|
||||
"name": "ask_user",
|
||||
"description": "Pause and ask the user a structured question.",
|
||||
"editable": False,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _colony_runtimes_for_name(manager: Any, colony_name: str) -> list[Any]:
|
||||
"""Return every live ColonyRuntime whose session is attached to ``colony_name``."""
|
||||
sessions = getattr(manager, "_sessions", None) or {}
|
||||
runtimes: list[Any] = []
|
||||
for session in sessions.values():
|
||||
if getattr(session, "colony_name", None) != colony_name:
|
||||
continue
|
||||
# Both ``session.colony`` (queen-side unified runtime) and
|
||||
# ``session.colony_runtime`` (legacy worker runtime) may carry
|
||||
# tools that need the allowlist applied. We update both.
|
||||
for attr in ("colony", "colony_runtime"):
|
||||
rt = getattr(session, attr, None)
|
||||
if rt is not None and rt not in runtimes:
|
||||
runtimes.append(rt)
|
||||
return runtimes
|
||||
|
||||
|
||||
async def _render_catalog(manager: Any, colony_name: str) -> dict[str, list[dict[str, Any]]]:
    """Build a per-server tool catalog for this colony.

    All colonies inherit the queen's MCP surface, so we reuse the
    manager-level ``_mcp_tool_catalog`` populated during queen boot.

    Resolution order: live runtime registry → cached manager snapshot →
    on-demand bootstrap (off the event loop) → ``{}`` on failure.
    """
    # If a live runtime exists and carries its own registry, prefer it —
    # it's authoritative (reflects any post-queen-boot MCP additions).
    for rt in _colony_runtimes_for_name(manager, colony_name):
        tools = getattr(rt, "_tools", None)
        if not tools:
            continue
        mcp_names = set(getattr(rt, "_mcp_tool_names_all", set()) or set())
        if not mcp_names:
            continue
        # Runtime registries don't record origin servers, so everything
        # is grouped under the single "(mcp)" pseudo-server.
        catalog: dict[str, list[dict[str, Any]]] = {"(mcp)": []}
        for tool in tools:
            name = getattr(tool, "name", None)
            if name in mcp_names:
                catalog["(mcp)"].append(
                    {
                        "name": name,
                        "description": getattr(tool, "description", ""),
                        "input_schema": getattr(tool, "parameters", {}),
                    }
                )
        return catalog

    # Otherwise fall back to the queen-level snapshot. Build it on demand
    # (off the event loop) when empty so the Tool Library works before
    # any queen has been started in this process.
    cached = getattr(manager, "_mcp_tool_catalog", None)
    if isinstance(cached, dict) and cached:
        return cached
    try:
        import asyncio

        from framework.server.queen_orchestrator import build_queen_tool_registry_bare

        # Bootstrap is blocking; run it in a worker thread.
        registry, built = await asyncio.to_thread(build_queen_tool_registry_bare)
        if manager is not None:
            # Cache on the manager so subsequent calls skip the bootstrap.
            manager._mcp_tool_catalog = built  # type: ignore[attr-defined]
            manager._bootstrap_tool_registry = registry  # type: ignore[attr-defined]
        return built
    except Exception:
        # Best-effort: an empty catalog renders as "stale" upstream.
        logger.warning("Colony tools: catalog bootstrap failed", exc_info=True)
        return {}
|
||||
|
||||
|
||||
def _lifecycle_entries_from_runtime(manager: Any, colony_name: str) -> list[dict[str, Any]]:
    """Non-MCP tools currently registered on the colony runtime (if any).

    When no live runtime is available we fall back to the bootstrap
    registry stashed on the manager by ``routes_queen_tools`` — it
    already has queen lifecycle tools registered, which are also the
    lifecycle tools colonies inherit at spawn time.

    Returns entries sorted by name on BOTH paths. (Previously the
    fallback branch returned its entries unsorted via an early return,
    so the API's ordering depended on whether a runtime happened to be
    live — a needless inconsistency for clients that diff responses.)
    """
    out: list[dict[str, Any]] = []
    seen: set[str] = set()

    def _push(name: str, description: str) -> None:
        # De-duplicate across runtimes and keep synthetic tools
        # (ask_user) out of the lifecycle listing — those render
        # separately.
        if not name or name in seen:
            return
        if name in _SYNTHETIC_NAMES:
            return
        seen.add(name)
        out.append({"name": name, "description": description, "editable": False})

    runtimes = _colony_runtimes_for_name(manager, colony_name)
    if runtimes:
        for rt in runtimes:
            mcp_names = set(getattr(rt, "_mcp_tool_names_all", set()) or set())
            for tool in getattr(rt, "_tools", []) or []:
                name = getattr(tool, "name", None)
                if name in mcp_names:
                    # MCP tools are listed under their server, not here.
                    continue
                _push(name, getattr(tool, "description", ""))
    else:
        # No live runtime — derive from the bootstrap registry.
        from framework.server.routes_queen_tools import _lifecycle_entries_without_session

        catalog = getattr(manager, "_mcp_tool_catalog", {}) or {}
        mcp_names: set[str] = set()
        for entries in catalog.values():
            for entry in entries:
                if entry.get("name"):
                    mcp_names.add(entry["name"])
        out.extend(_lifecycle_entries_without_session(manager, mcp_names))
    # BUG FIX: sort on both paths (the fallback used to early-return
    # unsorted). Assumes helper entries carry a "name" key, as the
    # runtime-path entries do — TODO confirm against routes_queen_tools.
    return sorted(out, key=lambda e: e["name"])
|
||||
|
||||
|
||||
def _render_servers(
|
||||
catalog: dict[str, list[dict[str, Any]]],
|
||||
enabled_mcp_tools: list[str] | None,
|
||||
) -> list[dict[str, Any]]:
|
||||
allowed: set[str] | None = None if enabled_mcp_tools is None else set(enabled_mcp_tools)
|
||||
servers: list[dict[str, Any]] = []
|
||||
for name in sorted(catalog):
|
||||
tools = []
|
||||
for entry in catalog[name]:
|
||||
tool_name = entry.get("name")
|
||||
tools.append(
|
||||
{
|
||||
"name": tool_name,
|
||||
"description": entry.get("description", ""),
|
||||
"input_schema": entry.get("input_schema", {}),
|
||||
"enabled": True if allowed is None else tool_name in allowed,
|
||||
}
|
||||
)
|
||||
servers.append({"name": name, "tools": tools})
|
||||
return servers
|
||||
|
||||
|
||||
async def handle_get_tools(request: web.Request) -> web.Response:
    """GET /api/colony/{colony_name}/tools.

    Renders the colony's full tool surface: lifecycle tools, synthetic
    tools (display only), and MCP tools grouped by server with per-tool
    ``enabled`` flags derived from the persisted allowlist.
    """
    colony_name = request.match_info["colony_name"]
    # A colony without metadata on disk doesn't exist from the API's view.
    if not colony_metadata_path(colony_name).exists():
        return web.json_response({"error": f"Colony '{colony_name}' not found"}, status=404)

    manager = request.app.get("manager")
    # Allowlist now lives in a dedicated tools.json sidecar; helper
    # migrates any legacy metadata.json field on first read.
    enabled = load_colony_tools_config(colony_name)

    catalog = await _render_catalog(manager, colony_name)
    # Empty catalog means no live runtime AND bootstrap failed — flag the
    # response as stale so the UI can warn instead of showing "no tools".
    stale = not catalog

    return web.json_response(
        {
            "colony_name": colony_name,
            "enabled_mcp_tools": enabled,
            "stale": stale,
            "lifecycle": _lifecycle_entries_from_runtime(manager, colony_name),
            "synthetic": _synthetic_entries(),
            "mcp_servers": _render_servers(catalog, enabled),
        }
    )
|
||||
|
||||
|
||||
async def handle_patch_tools(request: web.Request) -> web.Response:
|
||||
"""PATCH /api/colony/{colony_name}/tools."""
|
||||
colony_name = request.match_info["colony_name"]
|
||||
if not colony_metadata_path(colony_name).exists():
|
||||
return web.json_response({"error": f"Colony '{colony_name}' not found"}, status=404)
|
||||
|
||||
try:
|
||||
body = await request.json()
|
||||
except Exception:
|
||||
return web.json_response({"error": "Invalid JSON body"}, status=400)
|
||||
if not isinstance(body, dict) or "enabled_mcp_tools" not in body:
|
||||
return web.json_response(
|
||||
{"error": "Body must be an object with an 'enabled_mcp_tools' field"},
|
||||
status=400,
|
||||
)
|
||||
|
||||
enabled = body["enabled_mcp_tools"]
|
||||
if enabled is not None:
|
||||
if not isinstance(enabled, list) or not all(isinstance(x, str) for x in enabled):
|
||||
return web.json_response(
|
||||
{"error": "'enabled_mcp_tools' must be null or a list of strings"},
|
||||
status=400,
|
||||
)
|
||||
|
||||
manager = request.app.get("manager")
|
||||
|
||||
# Validate names against the known MCP catalog — lifts the same
|
||||
# typo-catching guarantee we already offer on queen tools.
|
||||
catalog = await _render_catalog(manager, colony_name)
|
||||
known: set[str] = {e.get("name") for entries in catalog.values() for e in entries if e.get("name")}
|
||||
if enabled is not None and known:
|
||||
unknown = sorted(set(enabled) - known)
|
||||
if unknown:
|
||||
return web.json_response(
|
||||
{"error": "Unknown MCP tool name(s)", "unknown": unknown},
|
||||
status=400,
|
||||
)
|
||||
|
||||
# Persist — tools.json sidecar, not metadata.json. Missing directory
|
||||
# is already guarded by the 404 check above.
|
||||
try:
|
||||
update_colony_tools_config(colony_name, enabled)
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Colony '{colony_name}' not found"}, status=404)
|
||||
|
||||
# Update any live runtimes so the NEXT worker spawn reflects the change.
|
||||
# We do NOT rebuild in-flight workers' tool lists (see module docstring).
|
||||
refreshed = 0
|
||||
for rt in _colony_runtimes_for_name(manager, colony_name):
|
||||
setter = getattr(rt, "set_tool_allowlist", None)
|
||||
if callable(setter):
|
||||
try:
|
||||
setter(enabled)
|
||||
refreshed += 1
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Colony tools: set_tool_allowlist failed on runtime for %s",
|
||||
colony_name,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Colony tools: colony=%s allowlist=%s refreshed_runtimes=%d",
|
||||
colony_name,
|
||||
"null" if enabled is None else f"{len(enabled)} tool(s)",
|
||||
refreshed,
|
||||
)
|
||||
return web.json_response(
|
||||
{
|
||||
"colony_name": colony_name,
|
||||
"enabled_mcp_tools": enabled,
|
||||
"refreshed_runtimes": refreshed,
|
||||
"note": "Changes apply to the next worker spawn. Running workers keep their booted tool list.",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def handle_list_colonies(request: web.Request) -> web.Response:
|
||||
"""GET /api/colonies — list colonies with their tool allowlist status.
|
||||
|
||||
Powers the Tool Library page's colony picker.
|
||||
"""
|
||||
from framework.host.colony_metadata import list_colony_names, load_colony_metadata
|
||||
|
||||
colonies: list[dict[str, Any]] = []
|
||||
for name in list_colony_names():
|
||||
meta = load_colony_metadata(name)
|
||||
# Provenance stays in metadata.json; allowlist lives in tools.json.
|
||||
allowlist = load_colony_tools_config(name)
|
||||
colonies.append(
|
||||
{
|
||||
"name": name,
|
||||
"queen_name": meta.get("queen_name"),
|
||||
"created_at": meta.get("created_at"),
|
||||
"has_allowlist": allowlist is not None,
|
||||
"enabled_count": len(allowlist) if isinstance(allowlist, list) else None,
|
||||
}
|
||||
)
|
||||
return web.json_response({"colonies": colonies})
|
||||
|
||||
|
||||
def register_routes(app: web.Application) -> None:
|
||||
"""Register per-colony tool routes."""
|
||||
app.router.add_get("/api/colonies/tools-index", handle_list_colonies)
|
||||
app.router.add_get("/api/colony/{colony_name}/tools", handle_get_tools)
|
||||
app.router.add_patch("/api/colony/{colony_name}/tools", handle_patch_tools)
|
||||
@@ -235,10 +235,6 @@ _SYSTEM_TOOLS: frozenset[str] = frozenset(
|
||||
{
|
||||
"get_account_info",
|
||||
"get_current_time",
|
||||
"bash_kill",
|
||||
"bash_output",
|
||||
"execute_command_tool",
|
||||
"example_tool",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -294,7 +290,9 @@ def _resolve_progress_db_by_name(colony_name: str) -> Path | None:
|
||||
"""
|
||||
if not _COLONY_NAME_RE.match(colony_name):
|
||||
return None
|
||||
db_path = Path.home() / ".hive" / "colonies" / colony_name / "data" / "progress.db"
|
||||
from framework.config import COLONIES_DIR
|
||||
|
||||
db_path = COLONIES_DIR / colony_name / "data" / "progress.db"
|
||||
return db_path if db_path.exists() else None
|
||||
|
||||
|
||||
|
||||
@@ -51,6 +51,8 @@ PROVIDER_ENV_VARS: dict[str, str] = {
|
||||
"together": "TOGETHER_API_KEY",
|
||||
"together_ai": "TOGETHER_API_KEY",
|
||||
"deepseek": "DEEPSEEK_API_KEY",
|
||||
"kimi": "KIMI_API_KEY",
|
||||
"hive": "HIVE_API_KEY",
|
||||
}
|
||||
|
||||
_SUBSCRIPTION_DEFINITIONS: list[dict[str, str]] = [
|
||||
|
||||
@@ -7,7 +7,7 @@ import os
|
||||
from aiohttp import web
|
||||
from pydantic import SecretStr
|
||||
|
||||
from framework.credentials.models import CredentialKey, CredentialObject
|
||||
from framework.credentials.models import CredentialDecryptionError, CredentialKey, CredentialObject
|
||||
from framework.credentials.store import CredentialStore
|
||||
from framework.server.app import validate_agent_path
|
||||
|
||||
@@ -84,23 +84,52 @@ def _credential_to_dict(cred: CredentialObject) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _is_available_for_specs(store: CredentialStore, credential_id: str) -> bool:
|
||||
"""Best-effort availability check for the repair UI.
|
||||
|
||||
The credential settings page must stay reachable even when an encrypted
|
||||
file was written with the wrong key or is otherwise unreadable.
|
||||
"""
|
||||
try:
|
||||
return store.is_available(credential_id)
|
||||
except CredentialDecryptionError as exc:
|
||||
logger.warning("Credential '%s' is unreadable; marking unavailable in specs: %s", credential_id, exc)
|
||||
return False
|
||||
|
||||
|
||||
async def handle_list_credentials(request: web.Request) -> web.Response:
|
||||
"""GET /api/credentials — list all credential metadata (no secrets)."""
|
||||
store = _get_store(request)
|
||||
cred_ids = store.list_credentials()
|
||||
credentials = []
|
||||
unreadable = []
|
||||
for cid in cred_ids:
|
||||
cred = store.get_credential(cid, refresh_if_needed=False)
|
||||
try:
|
||||
cred = store.get_credential(cid, refresh_if_needed=False)
|
||||
except CredentialDecryptionError as exc:
|
||||
logger.warning("Credential '%s' is unreadable while listing credentials: %s", cid, exc)
|
||||
unreadable.append(cid)
|
||||
continue
|
||||
if cred:
|
||||
credentials.append(_credential_to_dict(cred))
|
||||
return web.json_response({"credentials": credentials})
|
||||
return web.json_response({"credentials": credentials, "unreadable_credentials": unreadable})
|
||||
|
||||
|
||||
async def handle_get_credential(request: web.Request) -> web.Response:
|
||||
"""GET /api/credentials/{credential_id} — get single credential metadata."""
|
||||
credential_id = request.match_info["credential_id"]
|
||||
store = _get_store(request)
|
||||
cred = store.get_credential(credential_id, refresh_if_needed=False)
|
||||
try:
|
||||
cred = store.get_credential(credential_id, refresh_if_needed=False)
|
||||
except CredentialDecryptionError:
|
||||
return web.json_response(
|
||||
{
|
||||
"error": f"Credential '{credential_id}' could not be decrypted",
|
||||
"credential_id": credential_id,
|
||||
"recoverable": True,
|
||||
},
|
||||
status=409,
|
||||
)
|
||||
if cred is None:
|
||||
return web.json_response({"error": f"Credential '{credential_id}' not found"}, status=404)
|
||||
return web.json_response(_credential_to_dict(cred))
|
||||
@@ -393,7 +422,7 @@ async def handle_list_specs(request: web.Request) -> web.Response:
|
||||
if spec.aden_supported and not spec.direct_api_key_supported:
|
||||
available = len(accounts) > 0
|
||||
else:
|
||||
available = store.is_available(cred_id)
|
||||
available = _is_available_for_specs(store, cred_id)
|
||||
specs.append(
|
||||
{
|
||||
"credential_name": name,
|
||||
|
||||
@@ -209,6 +209,7 @@ async def handle_events(request: web.Request) -> web.StreamResponse:
|
||||
EventType.TRIGGER_AVAILABLE.value,
|
||||
EventType.TRIGGER_ACTIVATED.value,
|
||||
EventType.TRIGGER_DEACTIVATED.value,
|
||||
EventType.TRIGGER_FIRED.value,
|
||||
EventType.TRIGGER_REMOVED.value,
|
||||
EventType.TRIGGER_UPDATED.value,
|
||||
}
|
||||
|
||||
@@ -16,6 +16,11 @@ from framework.server.routes_sessions import _credential_error_response
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Strong refs to background fork-finalize tasks (compaction + worker-conv
|
||||
# copy) so asyncio doesn't GC them mid-run. fork_session_into_colony
|
||||
# schedules into this set and the done-callback evicts on completion.
|
||||
_BACKGROUND_FORK_TASKS: set[asyncio.Task[None]] = set()
|
||||
|
||||
|
||||
def _load_checkpoint_run_id(cp_path) -> str | None:
|
||||
try:
|
||||
@@ -37,17 +42,15 @@ _WORKER_INHERITED_TOOLS: frozenset[str] = frozenset(
|
||||
"read_file",
|
||||
"write_file",
|
||||
"edit_file",
|
||||
"hashline_edit",
|
||||
"list_directory",
|
||||
"search_files",
|
||||
"undo_changes",
|
||||
# Shell
|
||||
"run_command",
|
||||
# Terminal (basics — exec + ripgrep + glob/find)
|
||||
"terminal_exec",
|
||||
"terminal_rg",
|
||||
"terminal_find",
|
||||
# Framework synthetics (always available to any AgentLoop node)
|
||||
"set_output",
|
||||
"escalate",
|
||||
"ask_user",
|
||||
"ask_user_multiple",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -217,6 +220,25 @@ async def handle_chat(request: web.Request) -> web.Response:
|
||||
logger.debug("[handle_chat] Session resolution failed: %s", err)
|
||||
return err
|
||||
|
||||
# Sessions that have spawned a colony are locked: the user must compact +
|
||||
# fork into a fresh session before continuing the conversation. Frontend
|
||||
# surfaces this as a button instead of the textarea, but enforce server-
|
||||
# side too so the lock can't be bypassed by a stale tab or scripted call.
|
||||
if getattr(session, "colony_spawned", False):
|
||||
return web.json_response(
|
||||
{
|
||||
"error": "session_locked",
|
||||
"reason": "colony_spawned",
|
||||
"spawned_colony_name": getattr(session, "spawned_colony_name", None),
|
||||
"message": (
|
||||
"This session is locked because a colony has been "
|
||||
"spawned from it. Compact and start a new session "
|
||||
"with the same queen to continue."
|
||||
),
|
||||
},
|
||||
status=409,
|
||||
)
|
||||
|
||||
body = await request.json()
|
||||
message = body.get("message", "")
|
||||
display_message = body.get("display_message")
|
||||
@@ -663,6 +685,329 @@ async def handle_cancel_queen(request: web.Request) -> web.Response:
|
||||
return web.json_response({"cancelled": True})
|
||||
|
||||
|
||||
def persist_colony_spawn_lock(session: Any, colony_name: str) -> None:
|
||||
"""Persist the colony-spawned lock on a queen session.
|
||||
|
||||
Writes ``colony_spawned: true`` + ``spawned_colony_name`` + a timestamp
|
||||
into the queen session's ``meta.json`` and mirrors the same fields onto
|
||||
the live ``Session`` object so subsequent ``/chat`` calls in this
|
||||
process are rejected immediately without disk I/O.
|
||||
|
||||
Shared by the HTTP route ``handle_mark_colony_spawned`` (frontend
|
||||
click on the colony-link card) and the in-process ``create_colony``
|
||||
tool path (when the queen forks while in ``incubating`` phase).
|
||||
|
||||
Raises ``OSError`` if the meta.json write fails. Callers should catch
|
||||
and respond/log appropriately.
|
||||
"""
|
||||
from datetime import datetime as _dt
|
||||
|
||||
queen_dir = getattr(session, "queen_dir", None)
|
||||
if queen_dir is None:
|
||||
# Tool-side callers may invoke before the queen dir is available.
|
||||
# Still mirror onto the session so the in-process /chat guard
|
||||
# works; the meta.json write is just deferred until the next
|
||||
# session start writes the file (rare path).
|
||||
session.colony_spawned = True
|
||||
session.spawned_colony_name = colony_name
|
||||
return
|
||||
|
||||
meta_path = queen_dir / "meta.json"
|
||||
meta: dict = {}
|
||||
if meta_path.exists():
|
||||
try:
|
||||
meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
meta = {}
|
||||
|
||||
meta["colony_spawned"] = True
|
||||
meta["spawned_colony_name"] = colony_name
|
||||
meta["spawned_colony_at"] = _dt.now(UTC).isoformat()
|
||||
|
||||
meta_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
meta_path.write_text(json.dumps(meta), encoding="utf-8")
|
||||
|
||||
session.colony_spawned = True
|
||||
session.spawned_colony_name = colony_name
|
||||
|
||||
|
||||
async def handle_mark_colony_spawned(request: web.Request) -> web.Response:
|
||||
"""POST /api/sessions/{session_id}/mark-colony-spawned -- lock the queen DM.
|
||||
|
||||
Called by the frontend the first time the user clicks the
|
||||
COLONY_CREATED system message. Thin wrapper around
|
||||
:func:`persist_colony_spawn_lock` — the heavy lifting (meta.json
|
||||
merge + Session cache) lives in the helper so the in-process
|
||||
``create_colony`` path can reuse it without re-issuing an HTTP call.
|
||||
|
||||
Body: ``{"colony_name": "..."}``
|
||||
"""
|
||||
session, err = resolve_session(request)
|
||||
if err:
|
||||
return err
|
||||
|
||||
body = await request.json() if request.can_read_body else {}
|
||||
colony_name = (body.get("colony_name") or "").strip()
|
||||
if not colony_name:
|
||||
return web.json_response({"error": "colony_name is required"}, status=400)
|
||||
|
||||
try:
|
||||
persist_colony_spawn_lock(session, colony_name)
|
||||
except OSError as exc:
|
||||
logger.exception("mark_colony_spawned: failed to persist meta.json")
|
||||
return web.json_response({"error": f"failed to persist: {exc}"}, status=500)
|
||||
|
||||
return web.json_response(
|
||||
{
|
||||
"session_id": session.id,
|
||||
"colony_spawned": True,
|
||||
"spawned_colony_name": colony_name,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def _compact_queen_conversation_in_place(
|
||||
*,
|
||||
queen_dir: Any,
|
||||
queen_ctx: Any,
|
||||
queen_loop: Any,
|
||||
inherited_from: str | None = None,
|
||||
) -> tuple[int, int, str] | None:
|
||||
"""Compact ``queen_dir/conversations`` into one summary message in place.
|
||||
|
||||
Reads ``parts/`` via :class:`FileConversationStore`, runs
|
||||
:func:`llm_compact` with ``preserve_user_messages=True``, wipes
|
||||
``parts/`` + ``partials/`` and writes a single ``user``-role
|
||||
:class:`Message` (seq 0) tagged with ``inherited_from`` when provided,
|
||||
then resets ``cursor.json`` to ``next_seq=1``. ``events.jsonl`` is
|
||||
NOT touched — callers decide whether to wipe it (compact-and-fork)
|
||||
or append a boundary marker (colony fork).
|
||||
|
||||
Returns ``(messages_compacted, summary_chars, summary_text)`` on
|
||||
success, or ``None`` when there is nothing to do (no LLM ctx, no
|
||||
conversation directory, or no messages on disk). Raises on LLM or
|
||||
filesystem failure so the caller can decide between user-facing
|
||||
error response (compact-and-fork) and silent fall-through (colony
|
||||
fork keeps the raw transcript).
|
||||
"""
|
||||
import shutil as _shutil
|
||||
|
||||
from framework.agent_loop.conversation import Message
|
||||
from framework.agent_loop.internals.compaction import llm_compact
|
||||
from framework.storage.conversation_store import FileConversationStore
|
||||
|
||||
if queen_ctx is None or getattr(queen_ctx, "llm", None) is None:
|
||||
return None
|
||||
|
||||
convs_dir = queen_dir / "conversations"
|
||||
if not convs_dir.exists():
|
||||
return None
|
||||
|
||||
src_store = FileConversationStore(convs_dir)
|
||||
raw_parts = await src_store.read_parts()
|
||||
messages: list[Message] = []
|
||||
for part in raw_parts:
|
||||
try:
|
||||
messages.append(Message.from_storage_dict(part))
|
||||
except (KeyError, TypeError):
|
||||
# Skip malformed parts; the summary still covers everything else.
|
||||
logger.warning("compact_in_place: skipping malformed part %r", part)
|
||||
continue
|
||||
if not messages:
|
||||
return None
|
||||
|
||||
max_ctx_tokens = 180_000
|
||||
loop_cfg = getattr(queen_loop, "_config", None)
|
||||
if loop_cfg is not None and getattr(loop_cfg, "max_context_tokens", None):
|
||||
max_ctx_tokens = int(loop_cfg.max_context_tokens)
|
||||
|
||||
summary = await llm_compact(
|
||||
queen_ctx,
|
||||
messages,
|
||||
accumulator=None,
|
||||
max_context_tokens=max_ctx_tokens,
|
||||
preserve_user_messages=True,
|
||||
)
|
||||
|
||||
parts_dir = convs_dir / "parts"
|
||||
partials_dir = convs_dir / "partials"
|
||||
|
||||
def _wipe_stores() -> None:
|
||||
if parts_dir.exists():
|
||||
_shutil.rmtree(parts_dir)
|
||||
if partials_dir.exists():
|
||||
_shutil.rmtree(partials_dir)
|
||||
|
||||
await asyncio.to_thread(_wipe_stores)
|
||||
|
||||
summary_msg = Message(
|
||||
seq=0,
|
||||
role="user",
|
||||
content=summary,
|
||||
inherited_from=inherited_from,
|
||||
)
|
||||
dest_store = FileConversationStore(convs_dir)
|
||||
await dest_store.write_part(0, summary_msg.to_storage_dict())
|
||||
await dest_store.write_cursor({"next_seq": 1})
|
||||
|
||||
return (len(messages), len(summary), summary)
|
||||
|
||||
|
||||
async def handle_compact_and_fork(request: web.Request) -> web.Response:
|
||||
"""POST /api/sessions/{session_id}/compact-and-fork -- compact + new session.
|
||||
|
||||
The locked-by-colony-spawn UI calls this when the user clicks "compact
|
||||
+ start a new session with the same queen". The flow:
|
||||
|
||||
1. Mint a fresh session ID and copy the old queen-session dir to it.
|
||||
2. Run the shared :func:`_compact_queen_conversation_in_place` helper
|
||||
on the copy, which reads the parts, runs the LLM compactor with
|
||||
``preserve_user_messages=True``, and replaces ``parts/`` with a
|
||||
single summary message.
|
||||
3. Wipe ``events.jsonl`` so the new session presents a clean SSE
|
||||
replay (the parent's events would otherwise show up in the new
|
||||
chat as ghost history).
|
||||
4. Update meta.json (clear the parent's lock, record provenance) and
|
||||
spin up the live session.
|
||||
|
||||
The OLD session stays alive but locked; the user navigates to the
|
||||
new session via the response.
|
||||
"""
|
||||
import shutil
|
||||
import time as _time
|
||||
from datetime import datetime as _dt
|
||||
|
||||
from framework.agent_loop.types import AgentContext
|
||||
from framework.server.session_manager import (
|
||||
_generate_session_id,
|
||||
_queen_session_dir,
|
||||
)
|
||||
|
||||
session, err = resolve_session(request)
|
||||
if err:
|
||||
return err
|
||||
|
||||
queen_dir = getattr(session, "queen_dir", None)
|
||||
if queen_dir is None or not queen_dir.exists():
|
||||
return web.json_response(
|
||||
{"error": "queen session directory not found"},
|
||||
status=404,
|
||||
)
|
||||
|
||||
queen_executor = getattr(session, "queen_executor", None)
|
||||
if queen_executor is None:
|
||||
return web.json_response({"error": "queen is not running"}, status=503)
|
||||
queen_node = queen_executor.node_registry.get("queen") if queen_executor else None
|
||||
queen_ctx: AgentContext | None = getattr(queen_node, "_last_ctx", None) if queen_node else None
|
||||
if queen_ctx is None or queen_ctx.llm is None:
|
||||
return web.json_response(
|
||||
{
|
||||
"error": (
|
||||
"queen context not yet stamped (no LLM available for "
|
||||
"compaction). Send a message to the queen and retry."
|
||||
)
|
||||
},
|
||||
status=503,
|
||||
)
|
||||
|
||||
queen_name = session.queen_name or "default"
|
||||
|
||||
new_session_id = _generate_session_id()
|
||||
new_dir = _queen_session_dir(new_session_id, queen_name)
|
||||
if new_dir.exists():
|
||||
# Defensively: same-second collision would clobber another session.
|
||||
return web.json_response(
|
||||
{"error": f"new session dir collision: {new_dir}"},
|
||||
status=500,
|
||||
)
|
||||
|
||||
try:
|
||||
await asyncio.to_thread(shutil.copytree, queen_dir, new_dir)
|
||||
except OSError as exc:
|
||||
logger.exception("compact_and_fork: copytree failed")
|
||||
return web.json_response(
|
||||
{"error": f"failed to fork session dir: {exc}"},
|
||||
status=500,
|
||||
)
|
||||
|
||||
# Compact in place against the COPY so the source DM is untouched.
|
||||
# Failures here are user-visible — the whole point of the action is
|
||||
# the compacted summary.
|
||||
try:
|
||||
result = await _compact_queen_conversation_in_place(
|
||||
queen_dir=new_dir,
|
||||
queen_ctx=queen_ctx,
|
||||
queen_loop=queen_node,
|
||||
inherited_from=None, # this IS the new live session, not an inheritance
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.exception("compact_and_fork: compaction failed")
|
||||
return web.json_response(
|
||||
{"error": f"compaction failed: {exc}"},
|
||||
status=500,
|
||||
)
|
||||
if result is None:
|
||||
return web.json_response(
|
||||
{"error": "queen conversation is empty -- nothing to compact"},
|
||||
status=400,
|
||||
)
|
||||
messages_compacted, summary_chars, _summary_text = result
|
||||
|
||||
# Clean partials are already gone; also wipe events.jsonl so the new
|
||||
# session's SSE replay starts fresh (the helper deliberately leaves
|
||||
# events.jsonl alone so the colony-fork path can append a marker).
|
||||
new_events_path = new_dir / "events.jsonl"
|
||||
try:
|
||||
await asyncio.to_thread(lambda: new_events_path.exists() and new_events_path.unlink())
|
||||
except OSError:
|
||||
logger.warning("compact_and_fork: failed to wipe events.jsonl", exc_info=True)
|
||||
|
||||
# Update meta.json: clear the lock and record provenance.
|
||||
new_meta_path = new_dir / "meta.json"
|
||||
new_meta: dict = {}
|
||||
if new_meta_path.exists():
|
||||
try:
|
||||
new_meta = json.loads(new_meta_path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
new_meta = {}
|
||||
new_meta.pop("colony_spawned", None)
|
||||
new_meta.pop("spawned_colony_name", None)
|
||||
new_meta.pop("spawned_colony_at", None)
|
||||
new_meta["queen_id"] = queen_name
|
||||
new_meta["compacted_from"] = session.id
|
||||
new_meta["compacted_at"] = _dt.now(UTC).isoformat()
|
||||
new_meta["created_at"] = _time.time()
|
||||
try:
|
||||
new_meta_path.write_text(json.dumps(new_meta), encoding="utf-8")
|
||||
except OSError:
|
||||
logger.warning("compact_and_fork: failed to write new meta.json", exc_info=True)
|
||||
|
||||
manager: Any = request.app["manager"]
|
||||
try:
|
||||
new_session = await manager.create_session(
|
||||
session_id=None,
|
||||
queen_resume_from=new_session_id,
|
||||
queen_name=queen_name,
|
||||
initial_phase="independent",
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.exception("compact_and_fork: create_session failed for forked id %s", new_session_id)
|
||||
return web.json_response(
|
||||
{"error": f"failed to start forked session: {exc}"},
|
||||
status=500,
|
||||
)
|
||||
|
||||
return web.json_response(
|
||||
{
|
||||
"new_session_id": new_session.id,
|
||||
"queen_id": queen_name,
|
||||
"compacted_from": session.id,
|
||||
"summary_chars": summary_chars,
|
||||
"messages_compacted": messages_compacted,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def handle_colony_spawn(request: web.Request) -> web.Response:
|
||||
"""POST /api/sessions/{session_id}/colony-spawn -- fork queen session into a colony.
|
||||
|
||||
@@ -710,12 +1055,102 @@ async def handle_colony_spawn(request: web.Request) -> web.Response:
|
||||
return web.json_response(result)
|
||||
|
||||
|
||||
async def _compact_inherited_conversation(
|
||||
*,
|
||||
dest_queen_dir: Any,
|
||||
queen_ctx: Any,
|
||||
queen_loop: Any,
|
||||
source_session_id: str,
|
||||
) -> None:
|
||||
"""Compact a freshly-forked colony's inherited transcript in place.
|
||||
|
||||
Thin wrapper over :func:`_compact_queen_conversation_in_place` that
|
||||
tags the resulting summary message with ``inherited_from`` and
|
||||
appends a ``colony_fork_marker`` event to the colony's
|
||||
``events.jsonl`` so the frontend can group + collapse everything
|
||||
that preceded the fork.
|
||||
|
||||
Called from ``fork_session_into_colony`` after the parent queen
|
||||
session directory has been copied into the colony's queue dir.
|
||||
|
||||
Fail-soft: any exception (compaction, write, marker append) logs a
|
||||
warning and leaves the directory as the raw copytree wrote it. The
|
||||
colony still works; it just inherits the full DM transcript instead
|
||||
of the summary.
|
||||
"""
|
||||
import json as _json
|
||||
from datetime import UTC as _UTC, datetime as _datetime
|
||||
|
||||
try:
|
||||
result = await _compact_queen_conversation_in_place(
|
||||
queen_dir=dest_queen_dir,
|
||||
queen_ctx=queen_ctx,
|
||||
queen_loop=queen_loop,
|
||||
inherited_from=source_session_id,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"compact_inherited: compaction failed; leaving raw transcript",
|
||||
exc_info=True,
|
||||
)
|
||||
return
|
||||
|
||||
if result is None:
|
||||
# No queen ctx, no parts on disk, or empty conversation. Nothing
|
||||
# to compact and nothing to mark — the colony will just open with
|
||||
# an empty chat (or whatever raw state was copied).
|
||||
logger.info(
|
||||
"compact_inherited: nothing to compact for colony forked from %s",
|
||||
source_session_id,
|
||||
)
|
||||
return
|
||||
|
||||
messages_compacted, summary_chars, summary_text = result
|
||||
|
||||
# Append the boundary marker to the colony's events.jsonl so the
|
||||
# frontend can group + collapse everything that came before. The
|
||||
# marker carries the parent session id and a short summary preview
|
||||
# so the collapsed widget has something to label itself with even
|
||||
# before the user expands it.
|
||||
fork_iso = _datetime.now(_UTC).isoformat()
|
||||
marker = {
|
||||
"type": "colony_fork_marker",
|
||||
"stream_id": "queen",
|
||||
"data": {
|
||||
"parent_session_id": source_session_id,
|
||||
"fork_time": fork_iso,
|
||||
"summary_preview": summary_text[:240],
|
||||
"inherited_message_count": messages_compacted,
|
||||
},
|
||||
"timestamp": fork_iso,
|
||||
}
|
||||
events_path = dest_queen_dir / "events.jsonl"
|
||||
|
||||
def _append_marker() -> None:
|
||||
events_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(events_path, "a", encoding="utf-8") as f:
|
||||
f.write(_json.dumps(marker) + "\n")
|
||||
|
||||
try:
|
||||
await asyncio.to_thread(_append_marker)
|
||||
except OSError:
|
||||
logger.warning("compact_inherited: failed to append fork marker", exc_info=True)
|
||||
|
||||
logger.info(
|
||||
"compact_inherited: compacted %d parent message(s) -> 1 summary (%d chars) for colony forked from %s",
|
||||
messages_compacted,
|
||||
summary_chars,
|
||||
source_session_id,
|
||||
)
|
||||
|
||||
|
||||
async def fork_session_into_colony(
|
||||
*,
|
||||
session: Any,
|
||||
colony_name: str,
|
||||
task: str,
|
||||
tasks: list[dict] | None = None,
|
||||
concurrency_hint: int | None = None,
|
||||
) -> dict:
|
||||
"""Fork a queen session into a colony directory.
|
||||
|
||||
@@ -745,7 +1180,6 @@ async def fork_session_into_colony(
|
||||
import json
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from framework.agent_loop.agent_loop import AgentLoop, LoopConfig
|
||||
from framework.agent_loop.types import AgentContext
|
||||
@@ -809,7 +1243,9 @@ async def fork_session_into_colony(
|
||||
# would wrongly flag every fresh colony as "already-exists" if we
|
||||
# used ``not colony_dir.exists()``. A colony is "new" until its
|
||||
# worker config has actually been written.
|
||||
colony_dir = Path.home() / ".hive" / "colonies" / colony_name
|
||||
from framework.config import COLONIES_DIR
|
||||
|
||||
colony_dir = COLONIES_DIR / colony_name
|
||||
worker_name = "worker"
|
||||
worker_config_path = colony_dir / f"{worker_name}.json"
|
||||
is_new = not worker_config_path.exists()
|
||||
@@ -953,6 +1389,13 @@ async def fork_session_into_colony(
|
||||
"spawned_from": session.id,
|
||||
"spawned_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
# Concurrency advisory baked in at incubation time. Not enforced — the
|
||||
# progress.db queue is atomic regardless — but the colony queen reads
|
||||
# this when planning fan-outs (run_parallel_workers, trigger-fired
|
||||
# batches) so behavior matches what the user agreed to during
|
||||
# incubation.
|
||||
if isinstance(concurrency_hint, int) and concurrency_hint > 0:
|
||||
worker_meta["concurrency_hint"] = concurrency_hint
|
||||
worker_config_path.write_text(json.dumps(worker_meta, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
|
||||
# ── 3. Duplicate queen session into colony ───────────────────
|
||||
@@ -993,6 +1436,11 @@ async def fork_session_into_colony(
|
||||
dest_meta["queen_id"] = queen_name
|
||||
dest_meta["forked_from"] = session.id
|
||||
dest_meta["colony_fork"] = True # exclude from queen DM history
|
||||
# Clear any colony_spawned lock that came over from the parent meta —
|
||||
# it was the PARENT session that locked, not this freshly-forked one.
|
||||
dest_meta.pop("colony_spawned", None)
|
||||
dest_meta.pop("spawned_colony_name", None)
|
||||
dest_meta.pop("spawned_colony_at", None)
|
||||
dest_meta_path.write_text(json.dumps(dest_meta, ensure_ascii=False), encoding="utf-8")
|
||||
logger.info(
|
||||
"Duplicated queen session %s -> %s for colony '%s'",
|
||||
@@ -1000,15 +1448,110 @@ async def fork_session_into_colony(
|
||||
colony_session_id,
|
||||
colony_name,
|
||||
)
|
||||
# Copy queen conversations into worker storage so the worker
|
||||
# starts with the queen's full context.
|
||||
worker_storage = Path.home() / ".hive" / "agents" / colony_name / worker_name
|
||||
worker_storage.mkdir(parents=True, exist_ok=True)
|
||||
worker_conv_dir = worker_storage / "conversations"
|
||||
source_conv_dir = dest_queen_dir / "conversations"
|
||||
if source_conv_dir.exists():
|
||||
await asyncio.to_thread(shutil.copytree, source_conv_dir, worker_conv_dir, dirs_exist_ok=True)
|
||||
logger.info("Copied queen conversations to worker storage %s", worker_conv_dir)
|
||||
|
||||
# ── 3a. Compact the inherited conversation (fire-and-forget) ──
|
||||
# The colony queen doesn't need the full DM transcript — that
|
||||
# transcript was about REACHING the decision to fork, which is
|
||||
# now settled. Compaction replaces the copied parts with a
|
||||
# single summary message tagged ``inherited_from``.
|
||||
#
|
||||
# Compaction issues an LLM call that can legitimately exceed
|
||||
# the 60s tool-call timeout, so we schedule it (plus the
|
||||
# downstream worker-storage copy) as a background task and
|
||||
# return immediately. A compaction_status.json marker in
|
||||
# dest_queen_dir lets a subsequent colony session-load await
|
||||
# completion before reading the conversation files (see
|
||||
# session_manager.create_session_with_worker_colony).
|
||||
#
|
||||
# Fail-soft: any exception is logged and recorded in the
|
||||
# marker; the colony still works with the raw transcript.
|
||||
from framework.server import compaction_status
|
||||
|
||||
compaction_status.mark_in_progress(dest_queen_dir)
|
||||
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
_worker_storage = HIVE_HOME / "agents" / colony_name / worker_name
|
||||
_dest_queen_dir = dest_queen_dir
|
||||
_queen_ctx = queen_ctx
|
||||
_queen_loop = queen_loop
|
||||
_source_session_id = session.id
|
||||
|
||||
# Wall-clock cap on the background compaction's LLM call.
|
||||
# Without this a hung/misbehaving model (seen with local
|
||||
# endpoints) leaves compaction_status="in_progress" forever and
|
||||
# the colony-open await_completion waste its full poll window
|
||||
# before giving up. When this fires we still fall through to
|
||||
# the worker-storage copy below so the colony opens with the
|
||||
# raw transcript instead of empty state.
|
||||
_COMPACTION_TIMEOUT_SECONDS = 180.0
|
||||
|
||||
async def _finalize_fork() -> None:
|
||||
compaction_error: str | None = None
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
_compact_inherited_conversation(
|
||||
dest_queen_dir=_dest_queen_dir,
|
||||
queen_ctx=_queen_ctx,
|
||||
queen_loop=_queen_loop,
|
||||
source_session_id=_source_session_id,
|
||||
),
|
||||
timeout=_COMPACTION_TIMEOUT_SECONDS,
|
||||
)
|
||||
except TimeoutError:
|
||||
compaction_error = (
|
||||
f"compaction timed out after {_COMPACTION_TIMEOUT_SECONDS:.0f}s (falling back to raw transcript)"
|
||||
)
|
||||
logger.warning(
|
||||
"fork_session_into_colony: %s for %s",
|
||||
compaction_error,
|
||||
_dest_queen_dir,
|
||||
)
|
||||
except Exception as exc:
|
||||
compaction_error = f"compaction failed: {exc}"
|
||||
logger.warning(
|
||||
"fork_session_into_colony: %s for %s (falling back to raw transcript)",
|
||||
compaction_error,
|
||||
_dest_queen_dir,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# Worker storage copy runs regardless of the compaction
|
||||
# outcome. If compaction succeeded, the worker gets the
|
||||
# summary; if it failed / timed out, dest_queen_dir still
|
||||
# has the raw transcript from the earlier copytree and the
|
||||
# worker gets that. Without this copy-on-failure the worker
|
||||
# would open to empty state on every compaction hiccup.
|
||||
try:
|
||||
_worker_storage.mkdir(parents=True, exist_ok=True)
|
||||
worker_conv_dir = _worker_storage / "conversations"
|
||||
source_conv_dir = _dest_queen_dir / "conversations"
|
||||
if source_conv_dir.exists():
|
||||
await asyncio.to_thread(
|
||||
shutil.copytree,
|
||||
source_conv_dir,
|
||||
worker_conv_dir,
|
||||
dirs_exist_ok=True,
|
||||
)
|
||||
logger.info(
|
||||
"Copied queen conversations to worker storage %s",
|
||||
worker_conv_dir,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"fork_session_into_colony: worker-storage copy failed for %s",
|
||||
_worker_storage,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if compaction_error:
|
||||
compaction_status.mark_failed(_dest_queen_dir, compaction_error)
|
||||
else:
|
||||
compaction_status.mark_done(_dest_queen_dir)
|
||||
|
||||
_bg_task = asyncio.create_task(_finalize_fork())
|
||||
_BACKGROUND_FORK_TASKS.add(_bg_task)
|
||||
_bg_task.add_done_callback(_BACKGROUND_FORK_TASKS.discard)
|
||||
else:
|
||||
logger.warning(
|
||||
"Queen session dir %s not found, colony will start fresh",
|
||||
@@ -1036,6 +1579,39 @@ async def fork_session_into_colony(
|
||||
}
|
||||
metadata_path.write_text(json.dumps(metadata, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
|
||||
# ── 4a. Inherit the queen's tool allowlist into the colony ───
|
||||
# A colony forked from a curated queen should start with the same
|
||||
# tool surface (otherwise the colony silently falls back to its own
|
||||
# "allow every MCP tool" default, undoing the parent's curation).
|
||||
# We copy the queen's LIVE effective allowlist so the snapshot
|
||||
# reflects whatever was in force the moment the user clicked "Create
|
||||
# Colony". Users can further narrow the colony via the Tool Library.
|
||||
# Skip the write when the queen is on allow-all (None) so the colony
|
||||
# keeps the same semantics without creating an inert sidecar.
|
||||
try:
|
||||
queen_enabled = getattr(
|
||||
getattr(session, "phase_state", None),
|
||||
"enabled_mcp_tools",
|
||||
None,
|
||||
)
|
||||
if isinstance(queen_enabled, list):
|
||||
from framework.host.colony_tools_config import update_colony_tools_config
|
||||
|
||||
update_colony_tools_config(colony_name, list(queen_enabled))
|
||||
logger.info(
|
||||
"Inherited queen allowlist into colony '%s' (%d tools)",
|
||||
colony_name,
|
||||
len(queen_enabled),
|
||||
)
|
||||
except Exception:
|
||||
# Inheritance is best-effort — don't let a tools.json hiccup
|
||||
# abort colony creation.
|
||||
logger.warning(
|
||||
"Failed to inherit queen allowlist into colony '%s'",
|
||||
colony_name,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# ── 5. Update source queen session meta.json ─────────────────
|
||||
# Link the originating session back to the colony for discovery.
|
||||
source_meta_path = source_queen_dir / "meta.json"
|
||||
@@ -1068,6 +1644,12 @@ async def fork_session_into_colony(
|
||||
"is_new": is_new,
|
||||
"db_path": str(db_path),
|
||||
"task_ids": seeded_task_ids,
|
||||
# "in_progress" when a background compactor was scheduled above,
|
||||
# "skipped" when the source queen dir was missing (nothing to
|
||||
# compact). Frontend uses this to decide whether to display a
|
||||
# "preparing colony…" state while session-load blocks on the
|
||||
# compaction marker.
|
||||
"compaction_status": ("in_progress" if source_queen_dir.exists() else "skipped"),
|
||||
}
|
||||
|
||||
|
||||
@@ -1085,3 +1667,11 @@ def register_routes(app: web.Application) -> None:
|
||||
app.router.add_post("/api/sessions/{session_id}/replay", handle_replay)
|
||||
app.router.add_get("/api/sessions/{session_id}/goal-progress", handle_goal_progress)
|
||||
app.router.add_post("/api/sessions/{session_id}/colony-spawn", handle_colony_spawn)
|
||||
app.router.add_post(
|
||||
"/api/sessions/{session_id}/mark-colony-spawned",
|
||||
handle_mark_colony_spawned,
|
||||
)
|
||||
app.router.add_post(
|
||||
"/api/sessions/{session_id}/compact-and-fork",
|
||||
handle_compact_and_fork,
|
||||
)
|
||||
|
||||
@@ -0,0 +1,291 @@
|
||||
"""MCP server registration routes.
|
||||
|
||||
Thin HTTP wrapper around ``MCPRegistry`` so the frontend can add, remove,
|
||||
enable, and health-check user-registered MCP servers. The CLI path
|
||||
(``hive mcp add`` / ``hive mcp remove`` / etc.) is unchanged.
|
||||
|
||||
- GET /api/mcp/servers -- list installed servers
|
||||
- POST /api/mcp/servers -- register a local server
|
||||
- DELETE /api/mcp/servers/{name} -- remove a local server
|
||||
- POST /api/mcp/servers/{name}/enable -- enable a server
|
||||
- POST /api/mcp/servers/{name}/disable -- disable a server
|
||||
- POST /api/mcp/servers/{name}/health -- probe server health
|
||||
|
||||
New servers take effect on the *next* queen session start. Existing live
|
||||
queen sessions keep the tool list they booted with to avoid mid-turn
|
||||
cache invalidation. The ``add`` response hints at this explicitly.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from aiohttp import web
|
||||
|
||||
from framework.loader.mcp_errors import MCPError
|
||||
from framework.loader.mcp_registry import MCPRegistry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_VALID_TRANSPORTS = {"stdio", "http", "sse", "unix"}
|
||||
|
||||
|
||||
def _registry() -> MCPRegistry:
|
||||
# MCPRegistry is a thin wrapper around ~/.hive/mcp_registry/installed.json
|
||||
# so instantiation is cheap — no need to cache on app["..."].
|
||||
reg = MCPRegistry()
|
||||
reg.initialize()
|
||||
return reg
|
||||
|
||||
|
||||
def _package_builtin_servers() -> list[dict[str, Any]]:
|
||||
"""Return the package-baked queen MCP servers from ``queen/mcp_servers.json``.
|
||||
|
||||
Those servers are loaded directly by ``ToolRegistry.load_mcp_config``
|
||||
at queen boot and never go through ``MCPRegistry.list_installed``,
|
||||
so the raw registry view shows them as missing. Surface them here so
|
||||
the Tool Library reflects what the queen actually talks to.
|
||||
|
||||
Entries carry ``source: "built-in"`` and are NOT removable / toggleable
|
||||
— editing them requires changing the repo file.
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import framework.agents.queen as _queen_pkg
|
||||
|
||||
path = Path(_queen_pkg.__file__).parent / "mcp_servers.json"
|
||||
if not path.exists():
|
||||
return []
|
||||
try:
|
||||
data = json.loads(path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return []
|
||||
|
||||
out: list[dict[str, Any]] = []
|
||||
for name, cfg in data.items():
|
||||
if not isinstance(cfg, dict):
|
||||
continue
|
||||
out.append(
|
||||
{
|
||||
"name": name,
|
||||
"source": "built-in",
|
||||
"transport": cfg.get("transport", "stdio"),
|
||||
"description": cfg.get("description", "") or "",
|
||||
"enabled": True,
|
||||
"last_health_status": None,
|
||||
"last_error": None,
|
||||
"last_health_check_at": None,
|
||||
"tool_count": None,
|
||||
"removable": False,
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def _server_to_summary(entry: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Shape an installed.json entry for API responses.
|
||||
|
||||
Strips the full manifest body (which can be large) but keeps the tool
|
||||
list if the manifest already embeds one (happens for registry-installed
|
||||
servers). Users with ``source: "local"`` only get a tool list after
|
||||
running a health check.
|
||||
"""
|
||||
manifest = entry.get("manifest") or {}
|
||||
tools = manifest.get("tools") if isinstance(manifest, dict) else None
|
||||
if not isinstance(tools, list):
|
||||
tools = None
|
||||
return {
|
||||
"name": entry.get("name"),
|
||||
"source": entry.get("source"),
|
||||
"transport": entry.get("transport"),
|
||||
"description": (manifest.get("description") if isinstance(manifest, dict) else None) or "",
|
||||
"enabled": entry.get("enabled", True),
|
||||
"last_health_status": entry.get("last_health_status"),
|
||||
"last_error": entry.get("last_error"),
|
||||
"last_health_check_at": entry.get("last_health_check_at"),
|
||||
"tool_count": (len(tools) if tools is not None else None),
|
||||
}
|
||||
|
||||
|
||||
def _mcp_error_response(exc: MCPError, *, default_status: int = 400) -> web.Response:
|
||||
return web.json_response(
|
||||
{
|
||||
"error": exc.what,
|
||||
"code": exc.code.value,
|
||||
"what": exc.what,
|
||||
"why": exc.why,
|
||||
"fix": exc.fix,
|
||||
},
|
||||
status=default_status,
|
||||
)
|
||||
|
||||
|
||||
async def handle_list_servers(request: web.Request) -> web.Response:
|
||||
"""GET /api/mcp/servers — list every server the queen actually uses.
|
||||
|
||||
Merges two sources:
|
||||
|
||||
- ``MCPRegistry.list_installed()`` — servers registered via
|
||||
``hive mcp add`` / the ``/api/mcp/servers`` POST route, stored in
|
||||
``~/.hive/mcp_registry/installed.json``. These carry
|
||||
``source: "local"`` (user-added) or ``source: "registry"``
|
||||
(installed from the remote registry).
|
||||
- Repo-baked queen servers from
|
||||
``core/framework/agents/queen/mcp_servers.json``. These are loaded
|
||||
directly by the queen's ``ToolRegistry`` at boot and never touch
|
||||
``MCPRegistry``; we surface them here so the UI reflects what the
|
||||
queen really talks to. They are not removable from the UI because
|
||||
editing them requires changing the repo.
|
||||
|
||||
If a name collides between the two sources, the registry entry wins
|
||||
because that's the one the user has customized.
|
||||
"""
|
||||
reg = _registry()
|
||||
registry_entries = [_server_to_summary(e) for e in reg.list_installed()]
|
||||
seen_names = {e.get("name") for e in registry_entries}
|
||||
|
||||
package_entries = [e for e in _package_builtin_servers() if e.get("name") not in seen_names]
|
||||
|
||||
servers = [*package_entries, *registry_entries]
|
||||
return web.json_response({"servers": servers})
|
||||
|
||||
|
||||
async def handle_add_server(request: web.Request) -> web.Response:
|
||||
"""POST /api/mcp/servers — register a local MCP server.
|
||||
|
||||
Body mirrors ``MCPRegistry.add_local`` args:
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"name": "my-tool",
|
||||
"transport": "stdio" | "http" | "sse" | "unix",
|
||||
"command": "...", "args": [...], "env": {...}, "cwd": "...",
|
||||
"url": "...", "headers": {...},
|
||||
"socket_path": "...",
|
||||
"description": "..."
|
||||
}
|
||||
"""
|
||||
try:
|
||||
body = await request.json()
|
||||
except Exception:
|
||||
return web.json_response({"error": "Invalid JSON body"}, status=400)
|
||||
if not isinstance(body, dict):
|
||||
return web.json_response({"error": "Body must be a JSON object"}, status=400)
|
||||
|
||||
name = body.get("name")
|
||||
transport = body.get("transport")
|
||||
if not isinstance(name, str) or not name.strip():
|
||||
return web.json_response({"error": "'name' is required"}, status=400)
|
||||
if transport not in _VALID_TRANSPORTS:
|
||||
return web.json_response(
|
||||
{"error": f"'transport' must be one of {sorted(_VALID_TRANSPORTS)}"},
|
||||
status=400,
|
||||
)
|
||||
|
||||
reg = _registry()
|
||||
try:
|
||||
entry = reg.add_local(
|
||||
name=name.strip(),
|
||||
transport=transport,
|
||||
command=body.get("command"),
|
||||
args=body.get("args"),
|
||||
env=body.get("env"),
|
||||
cwd=body.get("cwd"),
|
||||
url=body.get("url"),
|
||||
headers=body.get("headers"),
|
||||
socket_path=body.get("socket_path"),
|
||||
description=body.get("description", ""),
|
||||
)
|
||||
except MCPError as exc:
|
||||
status = 409 if "already exists" in exc.what else 400
|
||||
return _mcp_error_response(exc, default_status=status)
|
||||
except Exception as exc:
|
||||
logger.exception("MCP add_local failed for %r", name)
|
||||
return web.json_response({"error": str(exc)}, status=500)
|
||||
|
||||
summary = _server_to_summary({"name": name, **entry})
|
||||
return web.json_response(
|
||||
{
|
||||
"server": summary,
|
||||
"hint": "Start a new queen session to use this server's tools.",
|
||||
},
|
||||
status=201,
|
||||
)
|
||||
|
||||
|
||||
async def handle_remove_server(request: web.Request) -> web.Response:
|
||||
"""DELETE /api/mcp/servers/{name} — remove a local server."""
|
||||
name = request.match_info["name"]
|
||||
reg = _registry()
|
||||
|
||||
existing = reg.get_server(name)
|
||||
if existing is None:
|
||||
return web.json_response({"error": f"Server '{name}' not installed"}, status=404)
|
||||
if existing.get("source") != "local":
|
||||
return web.json_response(
|
||||
{
|
||||
"error": f"Server '{name}' is a built-in; it cannot be removed from the UI.",
|
||||
},
|
||||
status=400,
|
||||
)
|
||||
|
||||
try:
|
||||
reg.remove(name)
|
||||
except MCPError as exc:
|
||||
return _mcp_error_response(exc, default_status=404)
|
||||
return web.json_response({"removed": name})
|
||||
|
||||
|
||||
async def handle_set_enabled(request: web.Request, *, enabled: bool) -> web.Response:
|
||||
name = request.match_info["name"]
|
||||
reg = _registry()
|
||||
try:
|
||||
if enabled:
|
||||
reg.enable(name)
|
||||
else:
|
||||
reg.disable(name)
|
||||
except MCPError as exc:
|
||||
return _mcp_error_response(exc, default_status=404)
|
||||
return web.json_response({"name": name, "enabled": enabled})
|
||||
|
||||
|
||||
async def handle_enable(request: web.Request) -> web.Response:
|
||||
"""POST /api/mcp/servers/{name}/enable."""
|
||||
return await handle_set_enabled(request, enabled=True)
|
||||
|
||||
|
||||
async def handle_disable(request: web.Request) -> web.Response:
|
||||
"""POST /api/mcp/servers/{name}/disable."""
|
||||
return await handle_set_enabled(request, enabled=False)
|
||||
|
||||
|
||||
async def handle_health(request: web.Request) -> web.Response:
|
||||
"""POST /api/mcp/servers/{name}/health — probe one server."""
|
||||
name = request.match_info["name"]
|
||||
reg = _registry()
|
||||
try:
|
||||
# MCPRegistry.health_check blocks on subprocess IO — run it off
|
||||
# the event loop so the HTTP worker stays responsive.
|
||||
import asyncio
|
||||
|
||||
result = await asyncio.to_thread(reg.health_check, name)
|
||||
except MCPError as exc:
|
||||
return _mcp_error_response(exc, default_status=404)
|
||||
except Exception as exc:
|
||||
logger.exception("MCP health_check failed for %r", name)
|
||||
return web.json_response({"error": str(exc)}, status=500)
|
||||
return web.json_response(result)
|
||||
|
||||
|
||||
def register_routes(app: web.Application) -> None:
|
||||
"""Register MCP server CRUD routes."""
|
||||
app.router.add_get("/api/mcp/servers", handle_list_servers)
|
||||
app.router.add_post("/api/mcp/servers", handle_add_server)
|
||||
app.router.add_delete("/api/mcp/servers/{name}", handle_remove_server)
|
||||
app.router.add_post("/api/mcp/servers/{name}/enable", handle_enable)
|
||||
app.router.add_post("/api/mcp/servers/{name}/disable", handle_disable)
|
||||
app.router.add_post("/api/mcp/servers/{name}/health", handle_health)
|
||||
@@ -0,0 +1,537 @@
|
||||
"""Per-queen MCP tool allowlist routes.
|
||||
|
||||
- GET /api/queen/{queen_id}/tools -- enumerate the queen's tool surface
|
||||
- PATCH /api/queen/{queen_id}/tools -- set or clear the MCP tool allowlist
|
||||
|
||||
Lifecycle and synthetic tools (``ask_user``) are always part of the queen's
|
||||
surface in INDEPENDENT mode and are returned with ``editable: false``. MCP
|
||||
tools are grouped by origin server and carry per-tool ``enabled`` flags.
|
||||
|
||||
The allowlist is persisted in a dedicated ``tools.json`` sidecar at
|
||||
``~/.hive/agents/queens/{queen_id}/tools.json``:
|
||||
|
||||
- ``null`` / missing file -> "allow every MCP tool" (default)
|
||||
- ``[]`` -> explicitly disable every MCP tool
|
||||
- ``["foo", "bar"]`` -> only these MCP tools pass through to the LLM
|
||||
|
||||
Filtering happens in ``QueenPhaseState.rebuild_independent_filter`` so the
|
||||
LLM prompt cache stays warm between saves.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from aiohttp import web
|
||||
|
||||
from framework.agents.queen.queen_profiles import (
|
||||
ensure_default_queens,
|
||||
load_queen_profile,
|
||||
)
|
||||
from framework.agents.queen.queen_tools_config import (
|
||||
delete_queen_tools_config,
|
||||
load_queen_tools_config,
|
||||
tools_config_exists,
|
||||
update_queen_tools_config,
|
||||
)
|
||||
from framework.agents.queen.queen_tools_defaults import (
|
||||
list_category_names,
|
||||
queen_role_categories,
|
||||
resolve_category_tools,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_SYNTHETIC_NAMES = {"ask_user"}
|
||||
|
||||
|
||||
async def _ensure_manager_catalog(manager: Any) -> dict[str, list[dict[str, Any]]]:
|
||||
"""Return the cached MCP tool catalog, building it on first call.
|
||||
|
||||
``queen_orchestrator.create_queen`` populates ``_mcp_tool_catalog`` on
|
||||
every queen boot. On a fresh backend process the user may open the
|
||||
Tool Library before any queen session has started, so the catalog is
|
||||
empty. In that case we build one from the shared MCP config; the
|
||||
first call pays an MCP-subprocess-spawn cost, subsequent calls are
|
||||
cache hits. The build runs off the event loop via asyncio.to_thread
|
||||
so the HTTP worker stays responsive while MCP servers initialize.
|
||||
"""
|
||||
if manager is None:
|
||||
return {}
|
||||
catalog = getattr(manager, "_mcp_tool_catalog", None)
|
||||
if isinstance(catalog, dict) and catalog:
|
||||
return catalog
|
||||
try:
|
||||
import asyncio
|
||||
|
||||
from framework.server.queen_orchestrator import build_queen_tool_registry_bare
|
||||
|
||||
registry, built = await asyncio.to_thread(build_queen_tool_registry_bare)
|
||||
manager._mcp_tool_catalog = built # type: ignore[attr-defined]
|
||||
manager._bootstrap_tool_registry = registry # type: ignore[attr-defined]
|
||||
return built
|
||||
except Exception:
|
||||
logger.warning("Tool catalog bootstrap failed", exc_info=True)
|
||||
return {}
|
||||
|
||||
|
||||
def _lifecycle_entries_without_session(
|
||||
manager: Any,
|
||||
mcp_names: set[str],
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Derive lifecycle tool names from the registry even without a session.
|
||||
|
||||
We register queen lifecycle tools against a temporary registry using a
|
||||
minimal stub, then subtract the MCP-origin set and the synthetic set.
|
||||
The result matches what the queen sees at runtime (minus context-
|
||||
specific variants).
|
||||
"""
|
||||
registry = getattr(manager, "_bootstrap_tool_registry", None)
|
||||
# If the bootstrap registry exists but doesn't carry lifecycle tools
|
||||
# yet, register them now.
|
||||
if registry is not None and not getattr(registry, "_lifecycle_bootstrap_done", False):
|
||||
try:
|
||||
from types import SimpleNamespace
|
||||
|
||||
from framework.tools.queen_lifecycle_tools import register_queen_lifecycle_tools
|
||||
|
||||
stub_session = SimpleNamespace(
|
||||
id="tool-library-bootstrap",
|
||||
colony_runtime=None,
|
||||
event_bus=None,
|
||||
worker_path=None,
|
||||
phase_state=None,
|
||||
llm=None,
|
||||
)
|
||||
register_queen_lifecycle_tools(
|
||||
registry,
|
||||
session=stub_session,
|
||||
session_id=stub_session.id,
|
||||
session_manager=None,
|
||||
manager_session_id=stub_session.id,
|
||||
phase_state=None,
|
||||
)
|
||||
registry._lifecycle_bootstrap_done = True # type: ignore[attr-defined]
|
||||
except Exception:
|
||||
logger.debug("lifecycle bootstrap failed", exc_info=True)
|
||||
|
||||
if registry is None:
|
||||
return []
|
||||
|
||||
out: list[dict[str, Any]] = []
|
||||
for name, tool in sorted(registry.get_tools().items()):
|
||||
if name in mcp_names or name in _SYNTHETIC_NAMES:
|
||||
continue
|
||||
out.append(
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"editable": False,
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def _synthetic_entries() -> list[dict[str, Any]]:
|
||||
"""Return display metadata for synthetic tools injected by the agent loop.
|
||||
|
||||
Kept behind a lazy import so test harnesses that don't wire the agent
|
||||
loop can still hit this route without blowing up.
|
||||
"""
|
||||
try:
|
||||
from framework.agent_loop.internals.synthetic_tools import build_ask_user_tool
|
||||
|
||||
tool = build_ask_user_tool()
|
||||
return [
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"editable": False,
|
||||
}
|
||||
]
|
||||
except Exception:
|
||||
return [
|
||||
{
|
||||
"name": "ask_user",
|
||||
"description": "Pause and ask the user a structured question.",
|
||||
"editable": False,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _live_queen_session(manager: Any, queen_id: str) -> Any:
|
||||
"""Return any live DM session owned by this queen, or ``None``."""
|
||||
sessions = getattr(manager, "_sessions", None) or {}
|
||||
for session in sessions.values():
|
||||
if getattr(session, "queen_name", None) != queen_id:
|
||||
continue
|
||||
# Prefer DM (non-colony) sessions
|
||||
if getattr(session, "colony_runtime", None) is None:
|
||||
return session
|
||||
return None
|
||||
|
||||
|
||||
def _render_mcp_servers(
|
||||
*,
|
||||
mcp_tool_names_by_server: dict[str, list[dict[str, Any]]],
|
||||
enabled_mcp_tools: list[str] | None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Shape the mcp_tool_catalog entries for the API response."""
|
||||
allowed: set[str] | None = None if enabled_mcp_tools is None else set(enabled_mcp_tools)
|
||||
servers: list[dict[str, Any]] = []
|
||||
for server_name in sorted(mcp_tool_names_by_server):
|
||||
entries = mcp_tool_names_by_server[server_name]
|
||||
tools = []
|
||||
for entry in entries:
|
||||
name = entry.get("name")
|
||||
enabled = True if allowed is None else name in allowed
|
||||
tools.append(
|
||||
{
|
||||
"name": name,
|
||||
"description": entry.get("description", ""),
|
||||
"input_schema": entry.get("input_schema", {}),
|
||||
"enabled": enabled,
|
||||
}
|
||||
)
|
||||
servers.append({"name": server_name, "tools": tools})
|
||||
return servers
|
||||
|
||||
|
||||
def _catalog_from_live_session(session: Any) -> dict[str, list[dict[str, Any]]]:
|
||||
"""Rebuild a per-server tool catalog from a live queen session.
|
||||
|
||||
The session's registry is authoritative — this reflects any hot-added
|
||||
MCP servers since the manager-level snapshot was cached.
|
||||
"""
|
||||
registry = getattr(session, "_queen_tool_registry", None)
|
||||
if registry is None:
|
||||
# session._queen_tools_by_name is a stash from create_queen; we
|
||||
# only have registry via the tools list, so reconstruct from the
|
||||
# phase state instead.
|
||||
phase_state = getattr(session, "phase_state", None)
|
||||
if phase_state is None:
|
||||
return {}
|
||||
mcp_names = getattr(phase_state, "mcp_tool_names_all", set()) or set()
|
||||
independent_tools = getattr(phase_state, "independent_tools", []) or []
|
||||
result: dict[str, list[dict[str, Any]]] = {"MCP Tools": []}
|
||||
for tool in independent_tools:
|
||||
if tool.name not in mcp_names:
|
||||
continue
|
||||
result["MCP Tools"].append(
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"input_schema": tool.parameters,
|
||||
}
|
||||
)
|
||||
return result if result["MCP Tools"] else {}
|
||||
|
||||
server_map = getattr(registry, "_mcp_server_tools", {}) or {}
|
||||
tools_by_name = {t.name: t for t in registry.get_tools().values()}
|
||||
catalog: dict[str, list[dict[str, Any]]] = {}
|
||||
for server_name, tool_names in server_map.items():
|
||||
entries: list[dict[str, Any]] = []
|
||||
for name in sorted(tool_names):
|
||||
tool = tools_by_name.get(name)
|
||||
if tool is None:
|
||||
continue
|
||||
entries.append(
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"input_schema": tool.parameters,
|
||||
}
|
||||
)
|
||||
catalog[server_name] = entries
|
||||
return catalog
|
||||
|
||||
|
||||
def _lifecycle_entries(
|
||||
*,
|
||||
session: Any,
|
||||
mcp_tool_names_all: set[str],
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Lifecycle tools = independent_tools minus MCP-origin minus synthetic.
|
||||
|
||||
We compute this from a live session when available so the list exactly
|
||||
matches what the queen actually sees on her next turn.
|
||||
"""
|
||||
if session is None:
|
||||
return []
|
||||
phase_state = getattr(session, "phase_state", None)
|
||||
if phase_state is None:
|
||||
return []
|
||||
result: list[dict[str, Any]] = []
|
||||
for tool in getattr(phase_state, "independent_tools", []) or []:
|
||||
if tool.name in mcp_tool_names_all:
|
||||
continue
|
||||
if tool.name in _SYNTHETIC_NAMES:
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"editable": False,
|
||||
}
|
||||
)
|
||||
return sorted(result, key=lambda x: x["name"])
|
||||
|
||||
|
||||
async def handle_get_tools(request: web.Request) -> web.Response:
|
||||
"""GET /api/queen/{queen_id}/tools — enumerate tool surface for the UI."""
|
||||
queen_id = request.match_info["queen_id"]
|
||||
ensure_default_queens()
|
||||
try:
|
||||
load_queen_profile(queen_id)
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Queen '{queen_id}' not found"}, status=404)
|
||||
|
||||
manager = request.app.get("manager")
|
||||
session = _live_queen_session(manager, queen_id) if manager is not None else None
|
||||
|
||||
# Prefer a live session's registry for freshness. Otherwise use (or
|
||||
# build on demand) the manager-level catalog so the Tool Library works
|
||||
# even before any queen has been started in this process.
|
||||
if session is not None:
|
||||
catalog = _catalog_from_live_session(session)
|
||||
else:
|
||||
catalog = await _ensure_manager_catalog(manager)
|
||||
stale = not catalog
|
||||
|
||||
mcp_tool_names_all: set[str] = set()
|
||||
for entries in catalog.values():
|
||||
for entry in entries:
|
||||
if entry.get("name"):
|
||||
mcp_tool_names_all.add(entry["name"])
|
||||
|
||||
if session is not None:
|
||||
lifecycle = _lifecycle_entries(
|
||||
session=session,
|
||||
mcp_tool_names_all=mcp_tool_names_all,
|
||||
)
|
||||
else:
|
||||
lifecycle = _lifecycle_entries_without_session(manager, mcp_tool_names_all)
|
||||
|
||||
# Allowlist lives in the dedicated tools.json sidecar; helper
|
||||
# migrates legacy profile.yaml field on first read, and falls back
|
||||
# to the role-based default when no sidecar exists.
|
||||
enabled_mcp_tools = load_queen_tools_config(queen_id, mcp_catalog=catalog)
|
||||
is_role_default = not tools_config_exists(queen_id)
|
||||
|
||||
response = {
|
||||
"queen_id": queen_id,
|
||||
"enabled_mcp_tools": enabled_mcp_tools,
|
||||
"is_role_default": is_role_default,
|
||||
"stale": stale,
|
||||
"lifecycle": lifecycle,
|
||||
"synthetic": _synthetic_entries(),
|
||||
"mcp_servers": _render_mcp_servers(
|
||||
mcp_tool_names_by_server=catalog,
|
||||
enabled_mcp_tools=enabled_mcp_tools,
|
||||
),
|
||||
"categories": _render_categories(queen_id, catalog),
|
||||
}
|
||||
return web.json_response(response)
|
||||
|
||||
|
||||
def _render_categories(
|
||||
queen_id: str,
|
||||
mcp_catalog: dict[str, list[dict[str, Any]]],
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Expose the role-default category table to the frontend.
|
||||
|
||||
Each entry carries the category name, the resolved member tool names
|
||||
(after ``@server:NAME`` shorthand expansion against the live catalog),
|
||||
and ``in_role_default`` to flag categories that contribute to this
|
||||
queen's role-based default. Lets the Tool Library group tools by
|
||||
category alongside the per-server view.
|
||||
"""
|
||||
applied = set(queen_role_categories(queen_id))
|
||||
out: list[dict[str, Any]] = []
|
||||
for name in list_category_names():
|
||||
out.append(
|
||||
{
|
||||
"name": name,
|
||||
"tools": resolve_category_tools(name, mcp_catalog),
|
||||
"in_role_default": name in applied,
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
async def handle_patch_tools(request: web.Request) -> web.Response:
|
||||
"""PATCH /api/queen/{queen_id}/tools — persist the MCP tool allowlist.
|
||||
|
||||
Body: ``{"enabled_mcp_tools": null | string[]}``.
|
||||
|
||||
- ``null`` resets to "allow every MCP tool" (default).
|
||||
- A list is validated against the known MCP catalog; unknown names
|
||||
are rejected with 400 so the frontend catches typos.
|
||||
"""
|
||||
queen_id = request.match_info["queen_id"]
|
||||
try:
|
||||
body = await request.json()
|
||||
except Exception:
|
||||
return web.json_response({"error": "Invalid JSON body"}, status=400)
|
||||
if not isinstance(body, dict) or "enabled_mcp_tools" not in body:
|
||||
return web.json_response(
|
||||
{"error": "Body must be an object with an 'enabled_mcp_tools' field"},
|
||||
status=400,
|
||||
)
|
||||
|
||||
enabled = body["enabled_mcp_tools"]
|
||||
if enabled is not None:
|
||||
if not isinstance(enabled, list) or not all(isinstance(x, str) for x in enabled):
|
||||
return web.json_response(
|
||||
{"error": "'enabled_mcp_tools' must be null or a list of strings"},
|
||||
status=400,
|
||||
)
|
||||
|
||||
ensure_default_queens()
|
||||
try:
|
||||
load_queen_profile(queen_id)
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Queen '{queen_id}' not found"}, status=404)
|
||||
|
||||
# Validate names against the known MCP tool catalog. We prefer a live
|
||||
# session's registry for the most up-to-date set, then fall back to
|
||||
# the manager-level snapshot (building it on demand if absent).
|
||||
manager = request.app.get("manager")
|
||||
session = _live_queen_session(manager, queen_id) if manager is not None else None
|
||||
if session is not None:
|
||||
catalog = _catalog_from_live_session(session)
|
||||
else:
|
||||
catalog = await _ensure_manager_catalog(manager)
|
||||
known_names: set[str] = set()
|
||||
for entries in catalog.values():
|
||||
for entry in entries:
|
||||
if entry.get("name"):
|
||||
known_names.add(entry["name"])
|
||||
|
||||
if enabled is not None and known_names:
|
||||
unknown = sorted(set(enabled) - known_names)
|
||||
if unknown:
|
||||
return web.json_response(
|
||||
{"error": "Unknown MCP tool name(s)", "unknown": unknown},
|
||||
status=400,
|
||||
)
|
||||
|
||||
# Persist — tools.json sidecar, not profile.yaml.
|
||||
try:
|
||||
update_queen_tools_config(queen_id, enabled)
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Queen '{queen_id}' not found"}, status=404)
|
||||
|
||||
# Hot-reload every live DM session for this queen. The filter memo is
|
||||
# rebuilt so the very next turn sees the new allowlist without a
|
||||
# session restart, and the prompt cache is invalidated exactly once.
|
||||
refreshed = 0
|
||||
sessions = getattr(manager, "_sessions", None) or {}
|
||||
for sess in sessions.values():
|
||||
if getattr(sess, "queen_name", None) != queen_id:
|
||||
continue
|
||||
phase_state = getattr(sess, "phase_state", None)
|
||||
if phase_state is None:
|
||||
continue
|
||||
phase_state.enabled_mcp_tools = enabled
|
||||
rebuild = getattr(phase_state, "rebuild_independent_filter", None)
|
||||
if callable(rebuild):
|
||||
try:
|
||||
rebuild()
|
||||
refreshed += 1
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Queen tools: rebuild_independent_filter failed for session %s",
|
||||
getattr(sess, "id", "?"),
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Queen tools: queen_id=%s allowlist=%s refreshed_sessions=%d",
|
||||
queen_id,
|
||||
"null" if enabled is None else f"{len(enabled)} tool(s)",
|
||||
refreshed,
|
||||
)
|
||||
return web.json_response(
|
||||
{
|
||||
"queen_id": queen_id,
|
||||
"enabled_mcp_tools": enabled,
|
||||
"refreshed_sessions": refreshed,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def handle_delete_tools(request: web.Request) -> web.Response:
|
||||
"""DELETE /api/queen/{queen_id}/tools — drop the sidecar, fall back to role defaults.
|
||||
|
||||
Users click "Reset to role default" in the Tool Library. That
|
||||
removes ``tools.json`` so the queen's effective allowlist becomes
|
||||
the role-based default (or allow-all if the queen has no role
|
||||
entry). Live sessions are refreshed so the next turn reflects the
|
||||
change without a restart.
|
||||
"""
|
||||
queen_id = request.match_info["queen_id"]
|
||||
ensure_default_queens()
|
||||
try:
|
||||
load_queen_profile(queen_id)
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Queen '{queen_id}' not found"}, status=404)
|
||||
|
||||
removed = delete_queen_tools_config(queen_id)
|
||||
|
||||
# Recompute the queen's effective allowlist from the role defaults
|
||||
# so we can hot-reload live sessions in one pass (same shape as
|
||||
# PATCH).
|
||||
manager = request.app.get("manager")
|
||||
session = _live_queen_session(manager, queen_id) if manager is not None else None
|
||||
if session is not None:
|
||||
catalog = _catalog_from_live_session(session)
|
||||
else:
|
||||
catalog = await _ensure_manager_catalog(manager)
|
||||
new_enabled = load_queen_tools_config(queen_id, mcp_catalog=catalog)
|
||||
|
||||
refreshed = 0
|
||||
sessions = getattr(manager, "_sessions", None) or {}
|
||||
for sess in sessions.values():
|
||||
if getattr(sess, "queen_name", None) != queen_id:
|
||||
continue
|
||||
phase_state = getattr(sess, "phase_state", None)
|
||||
if phase_state is None:
|
||||
continue
|
||||
phase_state.enabled_mcp_tools = new_enabled
|
||||
rebuild = getattr(phase_state, "rebuild_independent_filter", None)
|
||||
if callable(rebuild):
|
||||
try:
|
||||
rebuild()
|
||||
refreshed += 1
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Queen tools: rebuild_independent_filter failed for session %s",
|
||||
getattr(sess, "id", "?"),
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Queen tools: queen_id=%s reset-to-default removed=%s refreshed_sessions=%d",
|
||||
queen_id,
|
||||
removed,
|
||||
refreshed,
|
||||
)
|
||||
return web.json_response(
|
||||
{
|
||||
"queen_id": queen_id,
|
||||
"removed": removed,
|
||||
"enabled_mcp_tools": new_enabled,
|
||||
"is_role_default": True,
|
||||
"refreshed_sessions": refreshed,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def register_routes(app: web.Application) -> None:
|
||||
"""Register queen-tools routes."""
|
||||
app.router.add_get("/api/queen/{queen_id}/tools", handle_get_tools)
|
||||
app.router.add_patch("/api/queen/{queen_id}/tools", handle_patch_tools)
|
||||
app.router.add_delete("/api/queen/{queen_id}/tools", handle_delete_tools)
|
||||
@@ -248,15 +248,22 @@ async def handle_queen_session(request: web.Request) -> web.Response:
|
||||
# Skip colony sessions: a colony forked from this queen also carries
|
||||
# queen_name == queen_id, but it has a worker loaded (colony_id /
|
||||
# worker_path set) and is the colony's chat, not the queen's DM.
|
||||
for session in manager.list_sessions():
|
||||
if session.queen_name == queen_id and session.colony_id is None and session.worker_path is None:
|
||||
return web.json_response(
|
||||
{
|
||||
"session_id": session.id,
|
||||
"queen_id": queen_id,
|
||||
"status": "live",
|
||||
}
|
||||
)
|
||||
# When multiple DM sessions for this queen are live at once (e.g. the
|
||||
# user created a new session, then navigated away and back), return
|
||||
# the most recently loaded one so we don't resurrect a stale older
|
||||
# session ahead of a freshly created one.
|
||||
live_matches = [
|
||||
s for s in manager.list_sessions() if s.queen_name == queen_id and s.colony_id is None and s.worker_path is None
|
||||
]
|
||||
if live_matches:
|
||||
latest = max(live_matches, key=lambda s: s.loaded_at)
|
||||
return web.json_response(
|
||||
{
|
||||
"session_id": latest.id,
|
||||
"queen_id": queen_id,
|
||||
"status": "live",
|
||||
}
|
||||
)
|
||||
|
||||
# 2. Find the most recent cold session for this queen and resume it.
|
||||
# IMPORTANT: skip sessions that don't belong in the queen DM:
|
||||
@@ -378,6 +385,8 @@ async def handle_select_queen_session(request: web.Request) -> web.Response:
|
||||
|
||||
async def handle_new_queen_session(request: web.Request) -> web.Response:
|
||||
"""POST /api/queen/{queen_id}/session/new -- create a fresh queen session."""
|
||||
from framework.tools.queen_lifecycle_tools import QUEEN_PHASES
|
||||
|
||||
queen_id = request.match_info["queen_id"]
|
||||
manager = request.app["manager"]
|
||||
|
||||
@@ -387,9 +396,25 @@ async def handle_new_queen_session(request: web.Request) -> web.Response:
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Queen '{queen_id}' not found"}, status=404)
|
||||
|
||||
body = await request.json() if request.can_read_body else {}
|
||||
if request.can_read_body:
|
||||
try:
|
||||
body = await request.json()
|
||||
except json.JSONDecodeError:
|
||||
return web.json_response({"error": "Invalid JSON body"}, status=400)
|
||||
if not isinstance(body, dict):
|
||||
return web.json_response({"error": "Request body must be a JSON object"}, status=400)
|
||||
else:
|
||||
body = {}
|
||||
initial_prompt = body.get("initial_prompt")
|
||||
initial_phase = body.get("initial_phase") or "independent"
|
||||
if initial_phase not in QUEEN_PHASES:
|
||||
return web.json_response(
|
||||
{
|
||||
"error": f"Invalid initial_phase '{initial_phase}'",
|
||||
"valid": sorted(QUEEN_PHASES),
|
||||
},
|
||||
status=400,
|
||||
)
|
||||
|
||||
session = await manager.create_session(
|
||||
initial_prompt=initial_prompt,
|
||||
|
||||
@@ -10,6 +10,7 @@ Session-primary routes:
|
||||
- GET /api/sessions/{session_id}/stats — runtime statistics
|
||||
- GET /api/sessions/{session_id}/entry-points — list entry points
|
||||
- PATCH /api/sessions/{session_id}/triggers/{id} — update trigger task
|
||||
- POST /api/sessions/{session_id}/triggers/{id}/run — fire trigger once (manual)
|
||||
- GET /api/sessions/{session_id}/colonies — list colony IDs
|
||||
- GET /api/sessions/{session_id}/events/history — persisted eventbus log (for replay)
|
||||
|
||||
@@ -63,6 +64,8 @@ def _session_to_live_dict(session) -> dict:
|
||||
"queen_supports_images": supports_image_tool_results(queen_model) if queen_model else True,
|
||||
"queen_id": getattr(phase_state, "queen_id", None) if phase_state else None,
|
||||
"queen_name": (phase_state.queen_profile or {}).get("name") if phase_state else None,
|
||||
"colony_spawned": getattr(session, "colony_spawned", False),
|
||||
"spawned_colony_name": getattr(session, "spawned_colony_name", None),
|
||||
}
|
||||
|
||||
|
||||
@@ -119,8 +122,19 @@ async def handle_create_session(request: web.Request) -> web.Response:
|
||||
(equivalent to the old POST /api/agents). Otherwise creates a queen-only
|
||||
session that can later have a colony loaded via POST /sessions/{id}/colony.
|
||||
"""
|
||||
from framework.agents.queen.queen_profiles import ensure_default_queens, load_queen_profile
|
||||
from framework.tools.queen_lifecycle_tools import QUEEN_PHASES
|
||||
|
||||
manager = _get_manager(request)
|
||||
body = await request.json() if request.can_read_body else {}
|
||||
if request.can_read_body:
|
||||
try:
|
||||
body = await request.json()
|
||||
except json.JSONDecodeError:
|
||||
return web.json_response({"error": "Invalid JSON body"}, status=400)
|
||||
if not isinstance(body, dict):
|
||||
return web.json_response({"error": "Request body must be a JSON object"}, status=400)
|
||||
else:
|
||||
body = {}
|
||||
agent_path = body.get("agent_path")
|
||||
agent_id = body.get("agent_id")
|
||||
session_id = body.get("session_id")
|
||||
@@ -131,6 +145,21 @@ async def handle_create_session(request: web.Request) -> web.Response:
|
||||
initial_phase = body.get("initial_phase")
|
||||
worker_name = body.get("worker_name")
|
||||
|
||||
if initial_phase is not None and initial_phase not in QUEEN_PHASES:
|
||||
return web.json_response(
|
||||
{
|
||||
"error": f"Invalid initial_phase '{initial_phase}'",
|
||||
"valid": sorted(QUEEN_PHASES),
|
||||
},
|
||||
status=400,
|
||||
)
|
||||
if queen_name:
|
||||
ensure_default_queens()
|
||||
try:
|
||||
load_queen_profile(queen_name)
|
||||
except FileNotFoundError:
|
||||
return web.json_response({"error": f"Queen '{queen_name}' not found"}, status=404)
|
||||
|
||||
if agent_path:
|
||||
try:
|
||||
agent_path = str(validate_agent_path(agent_path))
|
||||
@@ -157,6 +186,7 @@ async def handle_create_session(request: web.Request) -> web.Response:
|
||||
model=model,
|
||||
initial_prompt=initial_prompt,
|
||||
queen_resume_from=queen_resume_from,
|
||||
queen_name=queen_name,
|
||||
initial_phase=initial_phase,
|
||||
)
|
||||
except ValueError as e:
|
||||
@@ -245,7 +275,14 @@ async def handle_get_live_session(request: web.Request) -> web.Response:
|
||||
}
|
||||
mono = getattr(session, "trigger_next_fire", {}).get(t.id)
|
||||
if mono is not None:
|
||||
entry["next_fire_in"] = max(0.0, mono - time.monotonic())
|
||||
remaining = max(0.0, mono - time.monotonic())
|
||||
entry["next_fire_in"] = remaining
|
||||
entry["next_fire_at"] = int((time.time() + remaining) * 1000)
|
||||
stats = getattr(session, "trigger_fire_stats", {}).get(t.id)
|
||||
if stats:
|
||||
entry["fire_count"] = stats.get("fire_count", 0)
|
||||
if stats.get("last_fired_at") is not None:
|
||||
entry["last_fired_at"] = stats["last_fired_at"]
|
||||
data["entry_points"].append(entry)
|
||||
data["colonies"] = session.colony_runtime.list_graphs()
|
||||
|
||||
@@ -395,7 +432,14 @@ async def handle_session_entry_points(request: web.Request) -> web.Response:
|
||||
}
|
||||
mono = getattr(session, "trigger_next_fire", {}).get(t.id)
|
||||
if mono is not None:
|
||||
entry["next_fire_in"] = max(0.0, mono - time.monotonic())
|
||||
remaining = max(0.0, mono - time.monotonic())
|
||||
entry["next_fire_in"] = remaining
|
||||
entry["next_fire_at"] = int((time.time() + remaining) * 1000)
|
||||
stats = getattr(session, "trigger_fire_stats", {}).get(t.id)
|
||||
if stats:
|
||||
entry["fire_count"] = stats.get("fire_count", 0)
|
||||
if stats.get("last_fired_at") is not None:
|
||||
entry["last_fired_at"] = stats["last_fired_at"]
|
||||
entry_points.append(entry)
|
||||
return web.json_response({"entry_points": entry_points})
|
||||
|
||||
@@ -546,6 +590,60 @@ async def handle_update_trigger_task(request: web.Request) -> web.Response:
|
||||
)
|
||||
|
||||
|
||||
async def handle_run_trigger(request: web.Request) -> web.Response:
|
||||
"""POST /api/sessions/{session_id}/triggers/{trigger_id}/run — fire the trigger once.
|
||||
|
||||
Manual invocation for testing. Works whether the trigger is active or
|
||||
inactive; does not change active state and does not reset the scheduled
|
||||
next-fire time of an active timer.
|
||||
"""
|
||||
session, err = resolve_session(request)
|
||||
if err:
|
||||
return err
|
||||
|
||||
trigger_id = request.match_info["trigger_id"]
|
||||
tdef = getattr(session, "available_triggers", {}).get(trigger_id)
|
||||
if tdef is None:
|
||||
return web.json_response(
|
||||
{"error": f"Trigger '{trigger_id}' not found"},
|
||||
status=404,
|
||||
)
|
||||
|
||||
if getattr(session, "colony_runtime", None) is None:
|
||||
return web.json_response({"error": "Colony not loaded"}, status=409)
|
||||
|
||||
executor = getattr(session, "queen_executor", None)
|
||||
queen_node = getattr(executor, "node_registry", {}).get("queen") if executor else None
|
||||
if queen_node is None:
|
||||
return web.json_response({"error": "Queen not ready"}, status=409)
|
||||
|
||||
from framework.agent_loop.agent_loop import TriggerEvent
|
||||
|
||||
try:
|
||||
await queen_node.inject_trigger(
|
||||
TriggerEvent(
|
||||
trigger_type=tdef.trigger_type,
|
||||
source_id=trigger_id,
|
||||
payload={
|
||||
"task": tdef.task or "",
|
||||
"trigger_config": tdef.trigger_config,
|
||||
"forced": True,
|
||||
},
|
||||
)
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return web.json_response(
|
||||
{"error": f"Failed to fire trigger: {exc}"},
|
||||
status=500,
|
||||
)
|
||||
|
||||
from framework.tools.queen_lifecycle_tools import _emit_trigger_fired
|
||||
|
||||
await _emit_trigger_fired(session, trigger_id, tdef.trigger_type)
|
||||
|
||||
return web.json_response({"status": "fired", "trigger_id": trigger_id})
|
||||
|
||||
|
||||
async def handle_activate_trigger(request: web.Request) -> web.Response:
|
||||
"""POST /api/sessions/{session_id}/triggers/{trigger_id}/activate — start a trigger."""
|
||||
session, err = resolve_session(request)
|
||||
@@ -597,6 +695,17 @@ async def handle_activate_trigger(request: web.Request) -> web.Response:
|
||||
|
||||
runner = getattr(session, "runner", None)
|
||||
colony_entry = runner.graph.entry_node if runner else None
|
||||
config_out = dict(tdef.trigger_config)
|
||||
mono = getattr(session, "trigger_next_fire", {}).get(trigger_id)
|
||||
if mono is not None:
|
||||
remaining = max(0.0, mono - time.monotonic())
|
||||
config_out["next_fire_in"] = remaining
|
||||
config_out["next_fire_at"] = int((time.time() + remaining) * 1000)
|
||||
stats = getattr(session, "trigger_fire_stats", {}).get(trigger_id)
|
||||
if stats:
|
||||
config_out["fire_count"] = stats.get("fire_count", 0)
|
||||
if stats.get("last_fired_at") is not None:
|
||||
config_out["last_fired_at"] = stats["last_fired_at"]
|
||||
await bus.publish(
|
||||
AgentEvent(
|
||||
type=EventType.TRIGGER_ACTIVATED,
|
||||
@@ -604,7 +713,7 @@ async def handle_activate_trigger(request: web.Request) -> web.Response:
|
||||
data={
|
||||
"trigger_id": trigger_id,
|
||||
"trigger_type": tdef.trigger_type,
|
||||
"trigger_config": tdef.trigger_config,
|
||||
"trigger_config": config_out,
|
||||
"name": tdef.description or trigger_id,
|
||||
**({"entry_node": colony_entry} if colony_entry else {}),
|
||||
},
|
||||
@@ -689,6 +798,110 @@ async def handle_session_colonies(request: web.Request) -> web.Response:
|
||||
_EVENTS_HISTORY_DEFAULT_LIMIT = 2000
|
||||
_EVENTS_HISTORY_MAX_LIMIT = 10000
|
||||
|
||||
# Files at or below this size use the simple forward-scan path (cheap enough
|
||||
# that the seek-backward dance isn't worth it). Above this threshold we read
|
||||
# the tail directly from end-of-file so a 50 MB log doesn't have to be paged
|
||||
# through entirely just to surface the last 2000 lines.
|
||||
_EVENTS_HISTORY_REVERSE_TAIL_THRESHOLD_BYTES = 1 << 20 # 1 MB
|
||||
_EVENTS_HISTORY_REVERSE_TAIL_CHUNK_BYTES = 64 * 1024
|
||||
|
||||
|
||||
def _read_events_tail(events_path: Path, limit: int) -> tuple[list[dict], int, bool]:
|
||||
"""Read the tail of an append-only JSONL events log.
|
||||
|
||||
Returns ``(events, total, truncated)``. ``events`` is at most ``limit``
|
||||
lines, oldest-first. ``total`` is the total number of non-blank lines in
|
||||
the file (exact for the small-file path, exact for the large-file path
|
||||
too — we do a separate fast newline-count pass).
|
||||
|
||||
Two paths:
|
||||
- Small files (< ~1 MB): forward scan. Cheap; gives an exact total for
|
||||
free. Defers ``json.loads`` to the bounded deque so we never parse a
|
||||
line that's about to be dropped.
|
||||
- Large files: seek to EOF and read backward in 64 KB chunks until we have
|
||||
at least ``limit`` complete lines. Parses only the tail. ``total`` is
|
||||
counted by a separate forward byte-scan that just counts newlines —
|
||||
no JSON parse — so it stays cheap even for huge files.
|
||||
|
||||
Without these optimizations, mounting the chat for a long-running queen
|
||||
with a ~50 k-event log used to spend most of its time inside ``json.loads``
|
||||
on the server thread (and block the event loop while doing it).
|
||||
"""
|
||||
from collections import deque
|
||||
|
||||
file_size = events_path.stat().st_size
|
||||
|
||||
if file_size <= _EVENTS_HISTORY_REVERSE_TAIL_THRESHOLD_BYTES:
|
||||
tail_raw: deque[str] = deque(maxlen=limit)
|
||||
total = 0
|
||||
with open(events_path, encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
total += 1
|
||||
tail_raw.append(line)
|
||||
events: list[dict] = []
|
||||
for raw in tail_raw:
|
||||
try:
|
||||
events.append(json.loads(raw))
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
return events, total, total > len(events)
|
||||
|
||||
# Large-file path: read backward until we have enough lines.
|
||||
import os as _os
|
||||
|
||||
chunk_size = _EVENTS_HISTORY_REVERSE_TAIL_CHUNK_BYTES
|
||||
pieces: list[bytes] = []
|
||||
newline_count = 0
|
||||
with open(events_path, "rb") as fb:
|
||||
fb.seek(0, _os.SEEK_END)
|
||||
pos = fb.tell()
|
||||
while pos > 0 and newline_count <= limit:
|
||||
read_size = min(chunk_size, pos)
|
||||
pos -= read_size
|
||||
fb.seek(pos)
|
||||
chunk = fb.read(read_size)
|
||||
newline_count += chunk.count(b"\n")
|
||||
pieces.append(chunk)
|
||||
pieces.reverse()
|
||||
blob = b"".join(pieces)
|
||||
|
||||
# Drop the leading partial line unless we read from offset 0.
|
||||
raw_lines = blob.split(b"\n")
|
||||
if pos > 0 and raw_lines:
|
||||
raw_lines = raw_lines[1:]
|
||||
decoded = [ln.decode("utf-8", errors="replace").strip() for ln in raw_lines]
|
||||
decoded = [ln for ln in decoded if ln]
|
||||
if len(decoded) > limit:
|
||||
decoded = decoded[-limit:]
|
||||
|
||||
events = []
|
||||
for raw in decoded:
|
||||
try:
|
||||
events.append(json.loads(raw))
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
# Separate fast pass for total: count newlines only, no JSON parse.
|
||||
total = 0
|
||||
with open(events_path, "rb") as fb:
|
||||
while True:
|
||||
chunk = fb.read(1 << 20)
|
||||
if not chunk:
|
||||
break
|
||||
total += chunk.count(b"\n")
|
||||
# File may end without a trailing newline; if so, the last non-empty line
|
||||
# was missed. Count it.
|
||||
if file_size > 0:
|
||||
with open(events_path, "rb") as fb:
|
||||
fb.seek(-1, _os.SEEK_END)
|
||||
if fb.read(1) != b"\n":
|
||||
total += 1
|
||||
|
||||
return events, total, total > len(events)
|
||||
|
||||
|
||||
async def handle_session_events_history(request: web.Request) -> web.Response:
|
||||
"""GET /api/sessions/{session_id}/events/history — persisted eventbus log.
|
||||
@@ -718,6 +931,9 @@ async def handle_session_events_history(request: web.Request) -> web.Response:
|
||||
recent N events". Long-running colonies have produced files with 50k+
|
||||
events; before this cap, restoring on page-mount shipped the whole thing
|
||||
down the wire and blocked the UI for seconds.
|
||||
|
||||
The actual file read runs in a worker thread via ``asyncio.to_thread`` so
|
||||
it doesn't block the event loop while other requests are in flight.
|
||||
"""
|
||||
session_id = request.match_info["session_id"]
|
||||
|
||||
@@ -743,24 +959,8 @@ async def handle_session_events_history(request: web.Request) -> web.Response:
|
||||
}
|
||||
)
|
||||
|
||||
# Tail the file using a bounded deque — O(limit) memory regardless
|
||||
# of file size. No need to materialize the whole list only to slice it.
|
||||
from collections import deque
|
||||
|
||||
tail: deque[dict] = deque(maxlen=limit)
|
||||
total = 0
|
||||
try:
|
||||
with open(events_path, encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
evt = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
total += 1
|
||||
tail.append(evt)
|
||||
events, total, truncated = await asyncio.to_thread(_read_events_tail, events_path, limit)
|
||||
except OSError:
|
||||
return web.json_response(
|
||||
{
|
||||
@@ -773,14 +973,13 @@ async def handle_session_events_history(request: web.Request) -> web.Response:
|
||||
}
|
||||
)
|
||||
|
||||
events = list(tail)
|
||||
return web.json_response(
|
||||
{
|
||||
"events": events,
|
||||
"session_id": session_id,
|
||||
"total": total,
|
||||
"returned": len(events),
|
||||
"truncated": total > len(events),
|
||||
"truncated": truncated,
|
||||
"limit": limit,
|
||||
}
|
||||
)
|
||||
@@ -866,6 +1065,8 @@ async def handle_discover(request: web.Request) -> web.Response:
|
||||
"tool_count": entry.tool_count,
|
||||
"tags": entry.tags,
|
||||
"last_active": entry.last_active,
|
||||
"created_at": entry.created_at,
|
||||
"icon": entry.icon,
|
||||
"is_loaded": str(entry.path.resolve()) in loaded_paths,
|
||||
"workers": [w.to_dict() for w in entry.workers],
|
||||
}
|
||||
@@ -893,8 +1094,10 @@ async def handle_delete_agent(request: web.Request) -> web.Response:
|
||||
except ValueError as exc:
|
||||
return web.json_response({"error": str(exc)}, status=400)
|
||||
|
||||
# Reject deletion of framework agents (~/.hive/agents/) — those are internal
|
||||
hive_agents_dir = Path.home() / ".hive" / "agents"
|
||||
# Reject deletion of framework agents ($HIVE_HOME/agents/) — those are internal
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
hive_agents_dir = HIVE_HOME / "agents"
|
||||
if resolved.is_relative_to(hive_agents_dir):
|
||||
return web.json_response({"error": "Cannot delete framework agents"}, status=403)
|
||||
|
||||
@@ -946,6 +1149,40 @@ async def handle_reveal_session_folder(request: web.Request) -> web.Response:
|
||||
return web.json_response({"path": str(folder)})
|
||||
|
||||
|
||||
async def handle_update_colony_metadata(request: web.Request) -> web.Response:
|
||||
"""PATCH /api/agents/metadata — update colony metadata (e.g. icon).
|
||||
|
||||
Body: {"agent_path": "...", "icon": "rocket"}
|
||||
"""
|
||||
try:
|
||||
body = await request.json()
|
||||
except Exception:
|
||||
return web.json_response({"error": "Invalid JSON body"}, status=400)
|
||||
|
||||
agent_path = body.get("agent_path")
|
||||
if not agent_path:
|
||||
return web.json_response({"error": "agent_path is required"}, status=400)
|
||||
|
||||
try:
|
||||
resolved = validate_agent_path(agent_path)
|
||||
except ValueError as exc:
|
||||
return web.json_response({"error": str(exc)}, status=400)
|
||||
|
||||
metadata_path = resolved / "metadata.json"
|
||||
metadata: dict = {}
|
||||
if metadata_path.exists():
|
||||
try:
|
||||
metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if "icon" in body:
|
||||
metadata["icon"] = body["icon"]
|
||||
|
||||
metadata_path.write_text(json.dumps(metadata, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
return web.json_response({"ok": True})
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Route registration
|
||||
# ------------------------------------------------------------------
|
||||
@@ -956,6 +1193,7 @@ def register_routes(app: web.Application) -> None:
|
||||
# Discovery & agent management
|
||||
app.router.add_get("/api/discover", handle_discover)
|
||||
app.router.add_delete("/api/agents", handle_delete_agent)
|
||||
app.router.add_patch("/api/agents/metadata", handle_update_colony_metadata)
|
||||
|
||||
# Session lifecycle
|
||||
app.router.add_post("/api/sessions", handle_create_session)
|
||||
@@ -983,6 +1221,10 @@ def register_routes(app: web.Application) -> None:
|
||||
"/api/sessions/{session_id}/triggers/{trigger_id}/deactivate",
|
||||
handle_deactivate_trigger,
|
||||
)
|
||||
app.router.add_post(
|
||||
"/api/sessions/{session_id}/triggers/{trigger_id}/run",
|
||||
handle_run_trigger,
|
||||
)
|
||||
app.router.add_get("/api/sessions/{session_id}/colonies", handle_session_colonies)
|
||||
|
||||
app.router.add_get("/api/sessions/{session_id}/events/history", handle_session_events_history)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,112 @@
|
||||
"""REST routes for task lists.
|
||||
|
||||
GET /api/tasks/{task_list_id} -- snapshot of one list
|
||||
GET /api/colonies/{colony_id}/task_lists -- helper for colony view
|
||||
GET /api/sessions/{session_id}/task_list_id -- helper for session view
|
||||
|
||||
The task_list_id segment uses URL-encoded colons (``colony%3Aabc`` /
|
||||
``session%3Aagent%3Asess``); aiohttp decodes them automatically.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from aiohttp import web
|
||||
|
||||
from framework.tasks import get_task_store
|
||||
from framework.tasks.scoping import (
|
||||
colony_task_list_id,
|
||||
session_task_list_id,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def handle_get_task_list(request: web.Request) -> web.Response:
|
||||
raw = request.match_info.get("task_list_id", "")
|
||||
if not raw:
|
||||
return web.json_response({"error": "task_list_id required"}, status=400)
|
||||
|
||||
store = get_task_store()
|
||||
if not await store.list_exists(raw):
|
||||
return web.json_response(
|
||||
{"error": f"Task list {raw!r} not found", "task_list_id": raw, "tasks": []},
|
||||
status=404,
|
||||
)
|
||||
|
||||
meta = await store.get_meta(raw)
|
||||
records = await store.list_tasks(raw)
|
||||
return web.json_response(
|
||||
{
|
||||
"task_list_id": raw,
|
||||
"role": meta.role.value if meta else "session",
|
||||
"meta": meta.model_dump(mode="json") if meta else None,
|
||||
"tasks": [
|
||||
{
|
||||
"id": r.id,
|
||||
"subject": r.subject,
|
||||
"description": r.description,
|
||||
"active_form": r.active_form,
|
||||
"owner": r.owner,
|
||||
"status": r.status.value,
|
||||
"blocks": list(r.blocks),
|
||||
"blocked_by": list(r.blocked_by),
|
||||
"metadata": dict(r.metadata),
|
||||
"created_at": r.created_at,
|
||||
"updated_at": r.updated_at,
|
||||
}
|
||||
for r in records
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def handle_get_colony_task_lists(request: web.Request) -> web.Response:
|
||||
"""Return template_task_list_id and queen_session_task_list_id for a colony."""
|
||||
colony_id = request.match_info.get("colony_id", "")
|
||||
if not colony_id:
|
||||
return web.json_response({"error": "colony_id required"}, status=400)
|
||||
|
||||
template_id = colony_task_list_id(colony_id)
|
||||
# Queen's session list — the queen-of-colony's session_id == the
|
||||
# browser-facing colony session id. The frontend already knows that
|
||||
# value; we surface what we have on disk for completeness.
|
||||
queen_session_id = request.query.get("queen_session_id")
|
||||
queen_list_id = session_task_list_id("queen", queen_session_id) if queen_session_id else None
|
||||
return web.json_response(
|
||||
{
|
||||
"template_task_list_id": template_id,
|
||||
"queen_session_task_list_id": queen_list_id,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def handle_get_session_task_list_id(request: web.Request) -> web.Response:
|
||||
"""Return task_list_id and picked_up_from for a session.
|
||||
|
||||
The session_id is the queen's session id or a worker's session id;
|
||||
both follow the same path. The agent_id is read from the request query
|
||||
(passed by the frontend, which already knows which agent the session
|
||||
belongs to).
|
||||
"""
|
||||
session_id = request.match_info.get("session_id", "")
|
||||
agent_id = request.query.get("agent_id", "queen")
|
||||
if not session_id:
|
||||
return web.json_response({"error": "session_id required"}, status=400)
|
||||
|
||||
task_list_id = session_task_list_id(agent_id, session_id)
|
||||
store = get_task_store()
|
||||
exists = await store.list_exists(task_list_id)
|
||||
return web.json_response(
|
||||
{
|
||||
"task_list_id": task_list_id if exists else None,
|
||||
"picked_up_from": None,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def register_routes(app: web.Application) -> None:
|
||||
app.router.add_get("/api/tasks/{task_list_id}", handle_get_task_list)
|
||||
app.router.add_get("/api/colonies/{colony_id}/task_lists", handle_get_colony_task_lists)
|
||||
app.router.add_get("/api/sessions/{session_id}/task_list_id", handle_get_session_task_list_id)
|
||||
@@ -67,11 +67,9 @@ async def handle_list_nodes(request: web.Request) -> web.Response:
|
||||
worker_session_id = request.query.get("session_id")
|
||||
if worker_session_id and session.worker_path:
|
||||
worker_session_id = safe_path_segment(worker_session_id)
|
||||
from pathlib import Path
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
state_path = (
|
||||
Path.home() / ".hive" / "agents" / session.worker_path.name / "sessions" / worker_session_id / "state.json"
|
||||
)
|
||||
state_path = HIVE_HOME / "agents" / session.worker_path.name / "sessions" / worker_session_id / "state.json"
|
||||
if state_path.exists():
|
||||
try:
|
||||
state = json.loads(state_path.read_text(encoding="utf-8"))
|
||||
|
||||
@@ -19,7 +19,7 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from framework.config import QUEENS_DIR
|
||||
from framework.config import QUEENS_DIR, get_max_tokens
|
||||
from framework.host.triggers import TriggerDefinition
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -93,6 +93,9 @@ class Session:
|
||||
worker_configured: bool = False
|
||||
# Monotonic timestamps for next trigger fire (mirrors AgentRuntime._timer_next_fire)
|
||||
trigger_next_fire: dict[str, float] = field(default_factory=dict)
|
||||
# Per-trigger fire stats (session lifetime): {trigger_id: {"fire_count": int, "last_fired_at": epoch_ms}}.
|
||||
# Reset on process restart — good enough as a "since this session started" counter.
|
||||
trigger_fire_stats: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
# Session directory resumption:
|
||||
# When set, _start_queen writes queen conversations to this existing session's
|
||||
# directory instead of creating a new one. This lets cold-restores accumulate
|
||||
@@ -111,6 +114,12 @@ class Session:
|
||||
# tool unlocked. The mode is the canonical discriminator for storage
|
||||
# path, tool exposure, and SSE filtering — see the Phase 2 plan.
|
||||
mode: Literal["dm", "colony"] = "dm"
|
||||
# Set to True after the user clicks the COLONY_CREATED system message
|
||||
# in this DM. Locks the chat input — the user must compact+fork into a
|
||||
# fresh session before continuing the conversation. Persisted in
|
||||
# meta.json so the lock survives server restarts.
|
||||
colony_spawned: bool = False
|
||||
spawned_colony_name: str | None = None
|
||||
|
||||
|
||||
class SessionManager:
|
||||
@@ -413,6 +422,27 @@ class SessionManager:
|
||||
if existing.worker_path and str(existing.worker_path) == str(agent_path):
|
||||
return existing
|
||||
|
||||
# When the queen forked this colony, the inherited DM transcript
|
||||
# is compacted in the background (see fork_session_into_colony).
|
||||
# Block here until that compactor finishes so _load_worker_core
|
||||
# reads the compacted summary — not the raw transcript (which
|
||||
# would defeat the fork's purpose). Bounded wait: on timeout we
|
||||
# proceed anyway so a stuck compactor can't brick the colony.
|
||||
if queen_resume_from:
|
||||
try:
|
||||
from framework.server import compaction_status
|
||||
|
||||
await compaction_status.await_completion(
|
||||
_find_queen_session_dir(queen_resume_from),
|
||||
timeout=180.0,
|
||||
)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"await_compaction failed for %s — proceeding",
|
||||
queen_resume_from,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
session = await self._create_session_core(
|
||||
session_id=_colony_session_id or queen_resume_from,
|
||||
model=model,
|
||||
@@ -516,8 +546,10 @@ class SessionManager:
|
||||
session.colony_name = colony_id
|
||||
session.worker_path = agent_path
|
||||
|
||||
# Worker storage: ~/.hive/agents/{colony_name}/{worker_name}/
|
||||
worker_storage = Path.home() / ".hive" / "agents" / colony_id / worker_name
|
||||
# Worker storage: $HIVE_HOME/agents/{colony_name}/{worker_name}/
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
worker_storage = HIVE_HOME / "agents" / colony_id / worker_name
|
||||
worker_storage.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy conversations from colony if fresh
|
||||
@@ -668,7 +700,10 @@ class SessionManager:
|
||||
available_tools=all_tools,
|
||||
goal_context=goal.to_prompt_context(),
|
||||
goal=goal,
|
||||
max_tokens=8192,
|
||||
# Worker output cap — pull from configuration.json instead of
|
||||
# hard-coding 8192. glm-5.1/kimi-k2.5 both support 32k out, and
|
||||
# capping at 8k silently truncates long worker turns mid-tool.
|
||||
max_tokens=get_max_tokens(),
|
||||
stream_id=worker_name,
|
||||
execution_id=worker_name,
|
||||
identity_prompt=worker_data.get("identity_prompt", ""),
|
||||
@@ -897,7 +932,9 @@ class SessionManager:
|
||||
that process is still running on the host. If it is, the session is
|
||||
owned by another healthy worker process, so leave it alone.
|
||||
"""
|
||||
sessions_path = Path.home() / ".hive" / "agents" / agent_path.name / "sessions"
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
sessions_path = HIVE_HOME / "agents" / agent_path.name / "sessions"
|
||||
if not sessions_path.exists():
|
||||
return
|
||||
|
||||
@@ -1193,8 +1230,27 @@ class SessionManager:
|
||||
logger.info("Session '%s': shutdown reflection spawned", session_id)
|
||||
self._background_tasks.add(task)
|
||||
task.add_done_callback(self._background_tasks.discard)
|
||||
except Exception:
|
||||
logger.warning("Session '%s': failed to spawn shutdown reflection", session_id, exc_info=True)
|
||||
except RuntimeError as exc:
|
||||
# Most common when a session is stopped after the event loop
|
||||
# has closed (e.g. during server shutdown or from an atexit
|
||||
# handler). The reflection would have had nothing to write
|
||||
# anyway — no new turns since the last periodic reflection.
|
||||
logger.warning(
|
||||
"Session '%s': shutdown reflection skipped — event loop unavailable (%s). "
|
||||
"Normal during server shutdown; anything worth persisting was saved by the "
|
||||
"periodic reflection after the last turn.",
|
||||
session_id,
|
||||
exc,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Session '%s': failed to spawn shutdown reflection: %s: %s. "
|
||||
"Check that queen_dir exists and session.llm is configured; full traceback follows.",
|
||||
session_id,
|
||||
type(exc).__name__,
|
||||
exc,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if session.queen_task is not None:
|
||||
session.queen_task.cancel()
|
||||
@@ -1319,6 +1375,13 @@ class SessionManager:
|
||||
_new_meta["agent_path"] = str(session.worker_path)
|
||||
_existing_meta.update(_new_meta)
|
||||
_meta_path.write_text(json.dumps(_existing_meta), encoding="utf-8")
|
||||
# Hydrate colony-spawned lock state from meta.json so the lock
|
||||
# survives server restart / cold-resume into a live session.
|
||||
if _existing_meta.get("colony_spawned") is True:
|
||||
session.colony_spawned = True
|
||||
_spawned_name = _existing_meta.get("spawned_colony_name")
|
||||
if isinstance(_spawned_name, str):
|
||||
session.spawned_colony_name = _spawned_name
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
@@ -1479,8 +1542,46 @@ class SessionManager:
|
||||
tool_executor=queen_tool_executor,
|
||||
event_bus=session.event_bus,
|
||||
colony_id=session.id,
|
||||
# Wire the on-disk colony name and queen id so
|
||||
# ColonyRuntime auto-derives its override paths. DM sessions
|
||||
# have no colony_name (session.colony_name is None), which
|
||||
# keeps them out of the per-colony JSON store.
|
||||
colony_name=getattr(session, "colony_name", None),
|
||||
queen_id=getattr(session, "queen_name", None) or None,
|
||||
pipeline_stages=[], # queen pipeline runs in queen_orchestrator, not here
|
||||
)
|
||||
|
||||
# Per-colony tool allowlist, loaded from the colony's metadata.json
|
||||
# when this session is attached to a real forked colony. For pure
|
||||
# queen DM sessions (session.colony_name is None) we only capture
|
||||
# the MCP-origin set — the allowlist stays ``None`` so every MCP
|
||||
# tool passes through by default.
|
||||
try:
|
||||
mcp_tool_names_all: set[str] = set()
|
||||
mgr_catalog = getattr(self, "_mcp_tool_catalog", None)
|
||||
if isinstance(mgr_catalog, dict):
|
||||
for entries in mgr_catalog.values():
|
||||
for entry in entries:
|
||||
name = entry.get("name") if isinstance(entry, dict) else None
|
||||
if name:
|
||||
mcp_tool_names_all.add(name)
|
||||
enabled_mcp_tools: list[str] | None = None
|
||||
colony_name = getattr(session, "colony_name", None)
|
||||
if colony_name:
|
||||
# Colony tool allowlist lives in a dedicated tools.json
|
||||
# sidecar next to metadata.json. The helper migrates any
|
||||
# legacy field out of metadata.json on first read.
|
||||
from framework.host.colony_tools_config import load_colony_tools_config
|
||||
|
||||
enabled_mcp_tools = load_colony_tools_config(colony_name)
|
||||
colony.set_tool_allowlist(enabled_mcp_tools, mcp_tool_names_all)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Colony allowlist bootstrap failed for session %s",
|
||||
session.id,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
await colony.start()
|
||||
session.colony = colony
|
||||
|
||||
@@ -1573,8 +1674,28 @@ class SessionManager:
|
||||
# Resolve entry node for trigger target
|
||||
runner = getattr(session, "runner", None)
|
||||
colony_entry = runner.graph.entry_node if runner else None
|
||||
fire_times = getattr(session, "trigger_next_fire", {})
|
||||
fire_stats = getattr(session, "trigger_fire_stats", {})
|
||||
now_mono = time.monotonic()
|
||||
now_wall = time.time()
|
||||
|
||||
for t in triggers.values():
|
||||
# Merge ephemeral next-fire data + historical fire stats into
|
||||
# trigger_config so the UI can render a live-ticking countdown
|
||||
# and a "fired Nx · last 2m ago" badge. `next_fire_at` is epoch
|
||||
# milliseconds (wall clock) — the frontend anchors its ticker
|
||||
# on this. `next_fire_in` is kept for legacy consumers.
|
||||
config_out = dict(t.trigger_config)
|
||||
mono = fire_times.get(t.id)
|
||||
if mono is not None:
|
||||
remaining = max(0.0, mono - now_mono)
|
||||
config_out["next_fire_in"] = remaining
|
||||
config_out["next_fire_at"] = int((now_wall + remaining) * 1000)
|
||||
stats = fire_stats.get(t.id)
|
||||
if stats:
|
||||
config_out["fire_count"] = stats.get("fire_count", 0)
|
||||
if stats.get("last_fired_at") is not None:
|
||||
config_out["last_fired_at"] = stats["last_fired_at"]
|
||||
await session.event_bus.publish(
|
||||
AgentEvent(
|
||||
type=event_type,
|
||||
@@ -1582,7 +1703,7 @@ class SessionManager:
|
||||
data={
|
||||
"trigger_id": t.id,
|
||||
"trigger_type": t.trigger_type,
|
||||
"trigger_config": t.trigger_config,
|
||||
"trigger_config": config_out,
|
||||
"name": t.description or t.id,
|
||||
**({"entry_node": colony_entry} if colony_entry else {}),
|
||||
},
|
||||
@@ -1650,6 +1771,42 @@ class SessionManager:
|
||||
def list_sessions(self) -> list[Session]:
|
||||
return list(self._sessions.values())
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Skill override helpers — used by routes_skills to find every live
|
||||
# SkillsManager affected by a queen- or colony-scope mutation so a
|
||||
# single HTTP call can reload them all.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def iter_queen_sessions(self, queen_id: str):
|
||||
"""Yield live sessions whose queen matches ``queen_id``."""
|
||||
for s in self._sessions.values():
|
||||
if getattr(s, "queen_name", None) == queen_id:
|
||||
yield s
|
||||
|
||||
def iter_colony_runtimes(
|
||||
self,
|
||||
*,
|
||||
queen_id: str | None = None,
|
||||
colony_name: str | None = None,
|
||||
):
|
||||
"""Yield live ``ColonyRuntime`` instances matching the filters.
|
||||
|
||||
``queen_id`` alone → every runtime whose ``queen_id`` matches
|
||||
(useful when the user toggles a queen-scope skill — all her
|
||||
colonies must reload). ``colony_name`` alone → the single
|
||||
runtime pinned to that colony. Both → intersection. No filters
|
||||
→ every live runtime (used by global ``/api/skills`` reload).
|
||||
"""
|
||||
for s in self._sessions.values():
|
||||
colony = getattr(s, "colony", None)
|
||||
if colony is None:
|
||||
continue
|
||||
if queen_id is not None and getattr(colony, "queen_id", None) != queen_id:
|
||||
continue
|
||||
if colony_name is not None and getattr(colony, "colony_name", None) != colony_name:
|
||||
continue
|
||||
yield colony
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cold session helpers (disk-only, no live runtime required)
|
||||
# ------------------------------------------------------------------
|
||||
@@ -1694,6 +1851,8 @@ class SessionManager:
|
||||
# Read extra metadata written at session start
|
||||
agent_name: str | None = None
|
||||
agent_path: str | None = None
|
||||
colony_spawned: bool = False
|
||||
spawned_colony_name: str | None = None
|
||||
meta_path = queen_dir / "meta.json"
|
||||
if meta_path.exists():
|
||||
try:
|
||||
@@ -1701,6 +1860,10 @@ class SessionManager:
|
||||
agent_name = meta.get("agent_name")
|
||||
agent_path = meta.get("agent_path")
|
||||
created_at = meta.get("created_at") or created_at
|
||||
colony_spawned = bool(meta.get("colony_spawned"))
|
||||
_spawned = meta.get("spawned_colony_name")
|
||||
if isinstance(_spawned, str):
|
||||
spawned_colony_name = _spawned
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
|
||||
@@ -1712,6 +1875,8 @@ class SessionManager:
|
||||
"created_at": created_at,
|
||||
"agent_name": agent_name,
|
||||
"agent_path": agent_path,
|
||||
"colony_spawned": colony_spawned,
|
||||
"spawned_colony_name": spawned_colony_name,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -1760,73 +1925,38 @@ class SessionManager:
|
||||
if meta.get("colony_fork"):
|
||||
continue
|
||||
|
||||
# Build a quick preview of the last human/assistant exchange.
|
||||
# We read all conversation parts, filter to client-facing messages,
|
||||
# and return the last assistant message content as a snippet.
|
||||
# Preview of the last client-facing exchange. Cached in
|
||||
# ``summary.json`` next to ``meta.json`` so the sidebar doesn't
|
||||
# have to rescan every part on each list call. The cache is
|
||||
# written incrementally by FileConversationStore.write_part; if
|
||||
# missing or stale (parts dir mtime newer than the summary file)
|
||||
# we do a one-time full rebuild and write a fresh summary.
|
||||
#
|
||||
# NOTE on activity timestamps: the session directory's own mtime
|
||||
# is NOT reliable as a "last activity" marker — POSIX dir mtime
|
||||
# only updates when direct entries change, and conversation
|
||||
# parts live under conversations/parts/, so writing a new part
|
||||
# does not bubble up to the session dir.
|
||||
from framework.storage import session_summary
|
||||
|
||||
last_message: str | None = None
|
||||
message_count: int = 0
|
||||
# Last-activity timestamp — mtime of the latest client-facing message.
|
||||
# Falls back to session creation time for empty sessions. NOTE: the
|
||||
# session directory's own mtime is NOT reliable here — POSIX dir mtime
|
||||
# only updates when direct entries change, and conversation parts are
|
||||
# nested under conversations/parts/, so writing a new part does not
|
||||
# bubble up to the session dir.
|
||||
last_active_at: float = float(created_at) if isinstance(created_at, (int, float)) else 0.0
|
||||
convs_dir = d / "conversations"
|
||||
|
||||
summary: dict | None = None
|
||||
if convs_dir.exists():
|
||||
try:
|
||||
all_parts: list[dict] = []
|
||||
if session_summary.is_stale(d):
|
||||
summary = session_summary.rebuild_summary(d)
|
||||
else:
|
||||
summary = session_summary.read_summary(d)
|
||||
|
||||
def _collect_parts(parts_dir: Path, _dest: list[dict] = all_parts) -> None:
|
||||
if not parts_dir.exists():
|
||||
return
|
||||
for part_file in sorted(parts_dir.iterdir()):
|
||||
if part_file.suffix != ".json":
|
||||
continue
|
||||
try:
|
||||
part = json.loads(part_file.read_text(encoding="utf-8"))
|
||||
part.setdefault("created_at", part_file.stat().st_mtime)
|
||||
_dest.append(part)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
continue
|
||||
|
||||
# Flat layout: conversations/parts/*.json
|
||||
_collect_parts(convs_dir / "parts")
|
||||
# Node-based layout: conversations/<node_id>/parts/*.json
|
||||
for node_dir in convs_dir.iterdir():
|
||||
if not node_dir.is_dir() or node_dir.name == "parts":
|
||||
continue
|
||||
_collect_parts(node_dir / "parts")
|
||||
# Filter to client-facing messages only
|
||||
client_msgs = [
|
||||
p
|
||||
for p in all_parts
|
||||
if not p.get("is_transition_marker")
|
||||
and p.get("role") != "tool"
|
||||
and not (p.get("role") == "assistant" and p.get("tool_calls"))
|
||||
]
|
||||
client_msgs.sort(key=lambda m: m.get("created_at", m.get("seq", 0)))
|
||||
message_count = len(client_msgs)
|
||||
# Take the latest message's timestamp as the activity marker.
|
||||
# _collect_parts sets created_at via setdefault to the part
|
||||
# file's mtime, so this is always a valid float.
|
||||
if client_msgs:
|
||||
latest_ts = client_msgs[-1].get("created_at")
|
||||
if isinstance(latest_ts, (int, float)) and latest_ts > last_active_at:
|
||||
last_active_at = float(latest_ts)
|
||||
# Last assistant message as preview snippet
|
||||
for msg in reversed(client_msgs):
|
||||
content = msg.get("content") or ""
|
||||
if isinstance(content, list):
|
||||
# Anthropic-style content blocks
|
||||
content = " ".join(
|
||||
b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text"
|
||||
)
|
||||
if content and msg.get("role") == "assistant":
|
||||
last_message = content[:120].strip()
|
||||
break
|
||||
except OSError:
|
||||
pass
|
||||
if summary is not None:
|
||||
message_count = int(summary.get("message_count") or 0)
|
||||
last_message = summary.get("last_message")
|
||||
cached_active = summary.get("last_active_at")
|
||||
if isinstance(cached_active, (int, float)) and cached_active > last_active_at:
|
||||
last_active_at = float(cached_active)
|
||||
|
||||
# Derive queen_id from directory structure: queens/{queen_id}/sessions/{session_id}
|
||||
queen_id = d.parent.parent.name if d.parent.name == "sessions" else None
|
||||
|
||||
@@ -1516,6 +1516,65 @@ class TestCredentials:
|
||||
data = await resp.json()
|
||||
assert data["credentials"] == []
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_list_credentials_skips_unreadable_encrypted_entry(self):
|
||||
from pydantic import SecretStr
|
||||
|
||||
from framework.credentials.models import CredentialDecryptionError, CredentialKey, CredentialObject
|
||||
|
||||
class BrokenStore:
|
||||
def list_credentials(self):
|
||||
return ["good_cred", "bad_cred"]
|
||||
|
||||
def get_credential(self, credential_id, refresh_if_needed=False):
|
||||
if credential_id == "bad_cred":
|
||||
raise CredentialDecryptionError("bad encrypted file")
|
||||
return CredentialObject(
|
||||
id=credential_id,
|
||||
keys={"api_key": CredentialKey(name="api_key", value=SecretStr("secret"))},
|
||||
)
|
||||
|
||||
app = create_app()
|
||||
app["credential_store"] = BrokenStore()
|
||||
|
||||
async with TestClient(TestServer(app)) as client:
|
||||
resp = await client.get("/api/credentials")
|
||||
assert resp.status == 200
|
||||
data = await resp.json()
|
||||
|
||||
assert [c["credential_id"] for c in data["credentials"]] == ["good_cred"]
|
||||
assert data["unreadable_credentials"] == ["bad_cred"]
|
||||
assert "secret" not in json.dumps(data)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_credential_unreadable_returns_recoverable_conflict(self):
|
||||
from framework.credentials.models import CredentialDecryptionError
|
||||
|
||||
class BrokenStore:
|
||||
def get_credential(self, credential_id, refresh_if_needed=False):
|
||||
raise CredentialDecryptionError("bad encrypted file")
|
||||
|
||||
app = create_app()
|
||||
app["credential_store"] = BrokenStore()
|
||||
|
||||
async with TestClient(TestServer(app)) as client:
|
||||
resp = await client.get("/api/credentials/bad_cred")
|
||||
data = await resp.json()
|
||||
|
||||
assert resp.status == 409
|
||||
assert data["credential_id"] == "bad_cred"
|
||||
assert data["recoverable"] is True
|
||||
|
||||
def test_specs_availability_treats_decryption_error_as_unavailable(self):
|
||||
from framework.credentials.models import CredentialDecryptionError
|
||||
from framework.server.routes_credentials import _is_available_for_specs
|
||||
|
||||
class BrokenStore:
|
||||
def is_available(self, credential_id):
|
||||
raise CredentialDecryptionError("bad encrypted file")
|
||||
|
||||
assert _is_available_for_specs(BrokenStore(), "exa_search") is False
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_save_and_list_credential(self):
|
||||
app = self._make_app()
|
||||
|
||||
@@ -0,0 +1,425 @@
|
||||
"""Tests for POST /api/colonies/import — tar-based colony onboarding.
|
||||
|
||||
The handler resolves writes against ``framework.config.COLONIES_DIR``;
|
||||
every test redirects that into a ``tmp_path`` so we never touch the real
|
||||
``~/.hive/colonies`` tree.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from aiohttp import FormData, web
|
||||
from aiohttp.test_utils import TestClient, TestServer
|
||||
|
||||
from framework.server import routes_colonies
|
||||
|
||||
|
||||
def _build_tar(layout: dict[str, bytes | None], *, gzip: bool = True) -> bytes:
|
||||
"""Build an in-memory tar with the given paths.
|
||||
|
||||
``layout`` maps archive member names to file contents; passing ``None``
|
||||
creates a directory entry instead of a regular file.
|
||||
"""
|
||||
buf = io.BytesIO()
|
||||
mode = "w:gz" if gzip else "w"
|
||||
with tarfile.open(fileobj=buf, mode=mode) as tf:
|
||||
for name, content in layout.items():
|
||||
if content is None:
|
||||
info = tarfile.TarInfo(name=name)
|
||||
info.type = tarfile.DIRTYPE
|
||||
info.mode = 0o755
|
||||
tf.addfile(info)
|
||||
else:
|
||||
info = tarfile.TarInfo(name=name)
|
||||
info.size = len(content)
|
||||
info.mode = 0o644
|
||||
tf.addfile(info, io.BytesIO(content))
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
def _build_tar_with_symlink(top: str, link_name: str, link_target: str) -> bytes:
|
||||
buf = io.BytesIO()
|
||||
with tarfile.open(fileobj=buf, mode="w:gz") as tf:
|
||||
info = tarfile.TarInfo(name=top)
|
||||
info.type = tarfile.DIRTYPE
|
||||
info.mode = 0o755
|
||||
tf.addfile(info)
|
||||
sym = tarfile.TarInfo(name=f"{top}/{link_name}")
|
||||
sym.type = tarfile.SYMTYPE
|
||||
sym.linkname = link_target
|
||||
tf.addfile(sym)
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def colonies_dir(tmp_path, monkeypatch):
|
||||
"""Redirect COLONIES_DIR into a tmp tree."""
|
||||
colonies = tmp_path / "colonies"
|
||||
colonies.mkdir()
|
||||
monkeypatch.setattr(routes_colonies, "COLONIES_DIR", colonies)
|
||||
return colonies
|
||||
|
||||
|
||||
async def _client(app: web.Application) -> TestClient:
|
||||
return TestClient(TestServer(app))
|
||||
|
||||
|
||||
def _app() -> web.Application:
|
||||
app = web.Application()
|
||||
routes_colonies.register_routes(app)
|
||||
return app
|
||||
|
||||
|
||||
def _form(file_bytes: bytes, *, filename: str = "colony.tar.gz", **fields: str) -> FormData:
|
||||
fd = FormData()
|
||||
fd.add_field("file", file_bytes, filename=filename, content_type="application/gzip")
|
||||
for k, v in fields.items():
|
||||
fd.add_field(k, v)
|
||||
return fd
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_happy_path_imports_colony(colonies_dir: Path) -> None:
|
||||
archive = _build_tar(
|
||||
{
|
||||
"x_daily/": None,
|
||||
"x_daily/metadata.json": b'{"colony_name":"x_daily"}',
|
||||
"x_daily/scripts/run.sh": b"#!/bin/sh\necho hi\n",
|
||||
}
|
||||
)
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 201, await resp.text()
|
||||
body = await resp.json()
|
||||
assert body["name"] == "x_daily"
|
||||
assert body["files_imported"] == 2
|
||||
assert (colonies_dir / "x_daily" / "metadata.json").read_bytes() == b'{"colony_name":"x_daily"}'
|
||||
assert (colonies_dir / "x_daily" / "scripts" / "run.sh").exists()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_name_override(colonies_dir: Path) -> None:
|
||||
archive = _build_tar({"x_daily/": None, "x_daily/file.txt": b"hi"})
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive, name="other_name"))
|
||||
assert resp.status == 201
|
||||
body = await resp.json()
|
||||
assert body["name"] == "other_name"
|
||||
assert (colonies_dir / "other_name" / "file.txt").read_bytes() == b"hi"
|
||||
assert not (colonies_dir / "x_daily").exists()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_existing_without_replace_flag(colonies_dir: Path) -> None:
|
||||
(colonies_dir / "x_daily").mkdir()
|
||||
(colonies_dir / "x_daily" / "old.txt").write_text("preserved")
|
||||
archive = _build_tar({"x_daily/": None, "x_daily/new.txt": b"new"})
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 409
|
||||
# Original content untouched
|
||||
assert (colonies_dir / "x_daily" / "old.txt").read_text() == "preserved"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_replace_existing_overwrites(colonies_dir: Path) -> None:
|
||||
(colonies_dir / "x_daily").mkdir()
|
||||
(colonies_dir / "x_daily" / "old.txt").write_text("preserved")
|
||||
archive = _build_tar({"x_daily/": None, "x_daily/new.txt": b"new"})
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post(
|
||||
"/api/colonies/import",
|
||||
data=_form(archive, replace_existing="true"),
|
||||
)
|
||||
assert resp.status == 201, await resp.text()
|
||||
assert not (colonies_dir / "x_daily" / "old.txt").exists()
|
||||
assert (colonies_dir / "x_daily" / "new.txt").read_text() == "new"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_path_traversal(colonies_dir: Path) -> None:
|
||||
archive = _build_tar(
|
||||
{
|
||||
"x_daily/": None,
|
||||
"x_daily/../escape.txt": b"oops",
|
||||
}
|
||||
)
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 400
|
||||
assert "traversal" in (await resp.json())["error"].lower() or "outside" in (await resp.json())["error"].lower()
|
||||
assert not (colonies_dir / "x_daily").exists()
|
||||
assert not (colonies_dir.parent / "escape.txt").exists()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_absolute_member(colonies_dir: Path) -> None:
|
||||
archive = _build_tar({"x_daily/": None, "/etc/passwd": b"oops"})
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_symlinks(colonies_dir: Path) -> None:
|
||||
archive = _build_tar_with_symlink("x_daily", "evil", "/etc/passwd")
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 400
|
||||
assert "symlink" in (await resp.json())["error"].lower()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_multiple_top_level_dirs(colonies_dir: Path) -> None:
|
||||
archive = _build_tar(
|
||||
{
|
||||
"a/": None,
|
||||
"a/x.txt": b"a",
|
||||
"b/": None,
|
||||
"b/y.txt": b"b",
|
||||
}
|
||||
)
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 400
|
||||
assert "top-level" in (await resp.json())["error"].lower()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_invalid_colony_name(colonies_dir: Path) -> None:
|
||||
archive = _build_tar({"Bad-Name/": None, "Bad-Name/x.txt": b"x"})
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_non_multipart(colonies_dir: Path) -> None:
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post(
|
||||
"/api/colonies/import", data=b"not multipart", headers={"Content-Type": "application/octet-stream"}
|
||||
)
|
||||
assert resp.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_corrupt_tar(colonies_dir: Path) -> None:
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(b"not a real tar"))
|
||||
assert resp.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rejects_missing_file_part(colonies_dir: Path) -> None:
|
||||
fd = FormData()
|
||||
fd.add_field("name", "anything")
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=fd)
|
||||
assert resp.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_accepts_uncompressed_tar(colonies_dir: Path) -> None:
|
||||
archive = _build_tar({"x_daily/": None, "x_daily/file.txt": b"plain"}, gzip=False)
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post(
|
||||
"/api/colonies/import",
|
||||
data=_form(archive, filename="colony.tar"),
|
||||
)
|
||||
assert resp.status == 201
|
||||
assert (colonies_dir / "x_daily" / "file.txt").read_text() == "plain"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Multi-root tar tests — the desktop's pushColonyToWorkspace ships the colony
|
||||
# dir + worker conversations + the queen's forked session in one tar so the
|
||||
# queen has full context on resume. Each recognised top-level prefix unpacks
|
||||
# into its corresponding HIVE_HOME subtree.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_root_unpacks_three_subtrees(colonies_dir: Path) -> None:
|
||||
archive = _build_tar(
|
||||
{
|
||||
"colonies/x_daily/": None,
|
||||
"colonies/x_daily/metadata.json": b'{"queen_session_id":"session_x"}',
|
||||
"colonies/x_daily/data/progress.db": b"sqlite",
|
||||
"agents/x_daily/worker/": None,
|
||||
"agents/x_daily/worker/conversations/": None,
|
||||
"agents/x_daily/worker/conversations/0001.json": b'{"role":"user"}',
|
||||
"agents/x_daily/worker/conversations/0002.json": b'{"role":"assistant"}',
|
||||
"agents/queens/queen_alpha/sessions/session_x/": None,
|
||||
"agents/queens/queen_alpha/sessions/session_x/queen.json": b'{"id":"x"}',
|
||||
}
|
||||
)
|
||||
async with await _client(_app()) as c:
|
||||
resp = await c.post("/api/colonies/import", data=_form(archive))
|
||||
assert resp.status == 201, await resp.text()
|
||||
body = await resp.json()
|
||||
# Colony files
|
||||
assert (colonies_dir / "x_daily" / "metadata.json").exists()
|
||||
assert (colonies_dir / "x_daily" / "data" / "progress.db").exists()
|
||||
# Worker conversations under HIVE_HOME/agents/<colony>/worker/
|
||||
hive_home = colonies_dir.parent
|
||||
assert (
|
||||
hive_home / "agents" / "x_daily" / "worker" / "conversations" / "0001.json"
|
||||
).read_bytes() == b'{"role":"user"}'
|
||||
# Queen forked session under HIVE_HOME/agents/queens/<queen>/sessions/<sid>/
|
||||
assert (hive_home / "agents" / "queens" / "queen_alpha" / "sessions" / "session_x" / "queen.json").exists()
|
||||
# Summary in response
|
||||
assert body["name"] == "x_daily"
|
||||
assert body["files_imported"] == 5
|
||||
by_root = body["by_root"]
|
||||
assert by_root["colonies"]["files"] == 2
|
||||
assert by_root["agents_worker"]["files"] == 2
|
||||
assert by_root["agents_queen"]["files"] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_colonies_only_succeeds(colonies_dir: Path) -> None:
    """The agents/ subtrees are optional — a fresh colony has no history."""
    tarball = _build_tar(
        {
            "colonies/x_daily/": None,
            "colonies/x_daily/metadata.json": b"{}",
        }
    )
    async with await _client(_app()) as client:
        response = await client.post("/api/colonies/import", data=_form(tarball))
        assert response.status == 201, await response.text()
        summary = await response.json()
        # Exactly the one metadata file lands.
        assert summary["files_imported"] == 1
        assert (colonies_dir / "x_daily" / "metadata.json").read_bytes() == b"{}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_rejects_missing_colonies_root(colonies_dir: Path) -> None:
    """Worker / queen trees alone aren't valid — every push must include
    the colony dir, otherwise the desktop's intent is unclear and we
    refuse rather than silently leave HIVE_HOME in a half-state."""
    tarball = _build_tar(
        {
            "agents/x_daily/worker/": None,
            "agents/x_daily/worker/log.json": b"{}",
        }
    )
    async with await _client(_app()) as client:
        response = await client.post("/api/colonies/import", data=_form(tarball))
        assert response.status == 400, await response.text()
        # Error message should point at the missing required root.
        message = (await response.json())["error"]
        assert "colonies/" in message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_replace_existing_colony(colonies_dir: Path) -> None:
    """An existing colony blocks the import (409, nothing touched) unless
    replace_existing is set, in which case the old tree is wiped first."""
    existing = colonies_dir / "x_daily"
    existing.mkdir()
    (existing / "old.txt").write_text("preserved")
    tarball = _build_tar(
        {
            "colonies/x_daily/": None,
            "colonies/x_daily/new.txt": b"new",
        }
    )
    # Without flag → 409
    async with await _client(_app()) as client:
        response = await client.post("/api/colonies/import", data=_form(tarball))
        assert response.status == 409
        assert (existing / "old.txt").read_text() == "preserved"
    # With flag → wipes + replaces
    async with await _client(_app()) as client:
        response = await client.post(
            "/api/colonies/import",
            data=_form(tarball, replace_existing="true"),
        )
        assert response.status == 201, await response.text()
        assert not (existing / "old.txt").exists()
        assert (existing / "new.txt").read_text() == "new"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_rejects_traversal_in_worker_subtree(colonies_dir: Path) -> None:
    """A ``..`` member inside the agents/ subtree fails the whole import
    and must never materialise outside its sandbox directory."""
    tarball = _build_tar(
        {
            "colonies/x_daily/": None,
            "colonies/x_daily/m.json": b"{}",
            "agents/x_daily/worker/": None,
            "agents/x_daily/worker/../escape.txt": b"oops",
        }
    )
    async with await _client(_app()) as client:
        response = await client.post("/api/colonies/import", data=_form(tarball))
        assert response.status == 400
        # The traversal target must not have been written.
        hive_home = colonies_dir.parent
        assert not (hive_home / "agents" / "escape.txt").exists()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_rejects_unknown_prefix(colonies_dir: Path) -> None:
    """Members under an unrecognised top-level root are dropped, not extracted."""
    tarball = _build_tar(
        {
            "colonies/x_daily/": None,
            "colonies/x_daily/m.json": b"{}",
            "etc/passwd": b"oops",
        }
    )
    async with await _client(_app()) as client:
        response = await client.post("/api/colonies/import", data=_form(tarball))
        # The unknown root is silently ignored (it doesn't match any
        # recognised prefix); the colony root is required and present, so
        # extraction succeeds and only the colonies subtree lands. We don't
        # write outside HIVE_HOME because the dispatcher only routes to
        # known destinations.
        assert response.status == 201, await response.text()
        hive_home = colonies_dir.parent
        assert not (hive_home.parent / "etc" / "passwd").exists()
        assert not (hive_home / "etc" / "passwd").exists()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_rejects_invalid_segment(colonies_dir: Path) -> None:
    """A queen name that fails path-segment validation rejects the archive."""
    tarball = _build_tar(
        {
            "colonies/x_daily/": None,
            "colonies/x_daily/m.json": b"{}",
            "agents/queens/Bad-Queen/sessions/sess_1/": None,
            "agents/queens/Bad-Queen/sessions/sess_1/x.json": b"{}",
        }
    )
    async with await _client(_app()) as client:
        response = await client.post("/api/colonies/import", data=_form(tarball))
        assert response.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multi_root_overwrites_agents_subtree_in_place(colonies_dir: Path) -> None:
    """Worker/queen subtrees are append-mostly stores — the import handler
    extracts in place without an existence-conflict gate so the desktop can
    re-push from another machine without explicit overwrite."""
    hive_home = colonies_dir.parent
    conversations = hive_home / "agents" / "x_daily" / "worker" / "conversations"
    conversations.mkdir(parents=True)
    (conversations / "0000_old.json").write_text("old")
    tarball = _build_tar(
        {
            "colonies/x_daily/": None,
            "colonies/x_daily/m.json": b"{}",
            "agents/x_daily/worker/": None,
            "agents/x_daily/worker/conversations/": None,
            "agents/x_daily/worker/conversations/0001_new.json": b"new",
        }
    )
    async with await _client(_app()) as client:
        response = await client.post(
            "/api/colonies/import",
            data=_form(tarball, replace_existing="true"),
        )
        assert response.status == 201, await response.text()
        # Old conversation file untouched (extraction is additive on agents/),
        # new one written.
        assert (conversations / "0000_old.json").read_text() == "old"
        assert (conversations / "0001_new.json").read_text() == "new"
|
||||
@@ -0,0 +1,300 @@
|
||||
"""Tests for the per-colony MCP tool allowlist filter + routes.
|
||||
|
||||
Covers:
|
||||
1. ``ColonyRuntime`` filter semantics (default-allow, allowlist, empty,
|
||||
lifecycle passes through).
|
||||
2. routes_colony_tools round trip (GET/PATCH, validation, 404).
|
||||
3. Colony index route for the Tool Library picker.
|
||||
|
||||
Routes never touch the real ``~/.hive/colonies`` tree — we redirect
|
||||
``COLONIES_DIR`` into ``tmp_path`` via monkeypatch.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from aiohttp import web
|
||||
from aiohttp.test_utils import TestClient, TestServer
|
||||
|
||||
from framework.host.colony_runtime import ColonyRuntime
|
||||
from framework.llm.provider import Tool
|
||||
from framework.server import routes_colony_tools
|
||||
|
||||
|
||||
def _tool(name: str) -> Tool:
    """Build a minimal Tool stub whose description is derived from *name*."""
    schema = {"type": "object"}
    return Tool(name=name, description=f"desc of {name}", parameters=schema)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ColonyRuntime filter unit tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _bare_runtime() -> ColonyRuntime:
    """Allocate a ColonyRuntime without running __init__, seeding only the
    two attributes the allowlist filter reads."""
    runtime = ColonyRuntime.__new__(ColonyRuntime)
    runtime._enabled_mcp_tools = None
    runtime._mcp_tool_names_all = set()
    return runtime
|
||||
|
||||
|
||||
class TestColonyFilter:
    """Unit tests for ColonyRuntime._apply_tool_allowlist semantics."""

    def test_default_is_noop(self):
        # No allowlist configured → the input list passes through unchanged.
        runtime = _bare_runtime()
        candidates = [_tool("mcp_a"), _tool("lc_b")]
        assert runtime._apply_tool_allowlist(candidates) == candidates

    def test_allowlist_gates_mcp_only(self):
        # Only MCP-catalogued tools are gated; lifecycle tools always pass.
        runtime = _bare_runtime()
        runtime._mcp_tool_names_all = {"mcp_a", "mcp_b"}
        runtime._enabled_mcp_tools = ["mcp_a"]
        candidates = [_tool("mcp_a"), _tool("mcp_b"), _tool("lc_c")]
        kept = [t.name for t in runtime._apply_tool_allowlist(candidates)]
        assert kept == ["mcp_a", "lc_c"]

    def test_empty_allowlist_keeps_lifecycle(self):
        # An explicitly-empty allowlist strips every MCP tool but keeps
        # the lifecycle tool.
        runtime = _bare_runtime()
        runtime._mcp_tool_names_all = {"mcp_a", "mcp_b"}
        runtime._enabled_mcp_tools = []
        candidates = [_tool("mcp_a"), _tool("mcp_b"), _tool("lc_c")]
        kept = [t.name for t in runtime._apply_tool_allowlist(candidates)]
        assert kept == ["lc_c"]

    def test_setter_mutates_live_state(self):
        runtime = _bare_runtime()
        runtime.set_tool_allowlist(["x"], {"x", "y"})
        assert runtime._enabled_mcp_tools == ["x"]
        assert runtime._mcp_tool_names_all == {"x", "y"}

        # Passing None on allowlist clears gating; mcp_tool_names_all
        # defaults to "keep current" so a subsequent caller doesn't need
        # to repeat the set.
        runtime.set_tool_allowlist(None)
        assert runtime._enabled_mcp_tools is None
        assert runtime._mcp_tool_names_all == {"x", "y"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Route round-trip tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class _FakeSession:
    """Minimal live-session double with the attributes the routes inspect."""

    colony_name: str
    # Runtime may be attached under either .colony or .colony_runtime;
    # the routes probe both, so the fake exposes both slots.
    colony: Any = None
    colony_runtime: Any = None
    id: str = "sess-1"
|
||||
|
||||
|
||||
@dataclass
class _FakeManager:
    """Bare manager double: a session table plus the MCP tool catalog."""

    _sessions: dict = field(default_factory=dict)
    _mcp_tool_catalog: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
@pytest.fixture
def colony_dir(tmp_path, monkeypatch):
    """Point COLONIES_DIR into a tmp tree and seed a colony."""
    colonies = tmp_path / "colonies"
    colonies.mkdir()
    # Both modules hold their own COLONIES_DIR reference; patch each.
    for module in ("framework.host.colony_metadata", "framework.host.colony_tools_config"):
        monkeypatch.setattr(f"{module}.COLONIES_DIR", colonies)

    name = "my_colony"
    seeded = colonies / name
    seeded.mkdir()
    metadata = {
        "colony_name": name,
        "queen_name": "queen_technology",
        "created_at": "2026-04-20T00:00:00+00:00",
    }
    (seeded / "metadata.json").write_text(json.dumps(metadata))
    return colonies, name
|
||||
|
||||
|
||||
async def _app(manager: _FakeManager) -> web.Application:
    """Build a minimal aiohttp app carrying *manager* plus the tool routes."""
    application = web.Application()
    application["manager"] = manager
    routes_colony_tools.register_routes(application)
    return application
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_tools_default_allow(colony_dir):
    """Without a sidecar the GET reports allow-all: a null allowlist and
    every catalogued tool flagged enabled."""
    _, name = colony_dir
    manager = _FakeManager(
        _mcp_tool_catalog={
            "files-tools": [
                {"name": "read_file", "description": "read", "input_schema": {}},
                {"name": "write_file", "description": "write", "input_schema": {}},
            ],
        }
    )
    app = await _app(manager)
    async with TestClient(TestServer(app)) as client:
        response = await client.get(f"/api/colony/{name}/tools")
        assert response.status == 200
        payload = await response.json()
        assert payload["enabled_mcp_tools"] is None
        assert payload["stale"] is False
        by_name = {t["name"]: t for t in payload["mcp_servers"][0]["tools"]}
        assert all(entry["enabled"] for entry in by_name.values())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_patch_persists_and_validates(colony_dir):
    """PATCH writes the allowlist to tools.json (never metadata.json),
    GET reflects it, and unknown tool names are rejected with 400."""
    colonies_dir, name = colony_dir
    manager = _FakeManager(
        _mcp_tool_catalog={
            "files-tools": [
                {"name": "read_file", "description": "", "input_schema": {}},
                {"name": "write_file", "description": "", "input_schema": {}},
            ]
        }
    )
    app = await _app(manager)
    tools_path = colonies_dir / name / "tools.json"
    metadata_path = colonies_dir / name / "metadata.json"

    async with TestClient(TestServer(app)) as client:
        response = await client.patch(f"/api/colony/{name}/tools", json={"enabled_mcp_tools": ["read_file"]})
        assert response.status == 200
        payload = await response.json()
        assert payload["enabled_mcp_tools"] == ["read_file"]

        # Persisted to tools.json; metadata.json does not carry the field.
        sidecar = json.loads(tools_path.read_text())
        assert sidecar["enabled_mcp_tools"] == ["read_file"]
        assert "updated_at" in sidecar
        metadata = json.loads(metadata_path.read_text())
        assert "enabled_mcp_tools" not in metadata

        # GET reflects the allowlist
        response = await client.get(f"/api/colony/{name}/tools")
        payload = await response.json()
        by_name = {t["name"]: t for t in payload["mcp_servers"][0]["tools"]}
        assert by_name["read_file"]["enabled"] is True
        assert by_name["write_file"]["enabled"] is False

        # Unknown → 400
        response = await client.patch(f"/api/colony/{name}/tools", json={"enabled_mcp_tools": ["ghost"]})
        assert response.status == 400
        assert "ghost" in (await response.json()).get("unknown", [])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_patch_refreshes_live_runtime(colony_dir):
    """PATCH pushes the new allowlist into every live runtime for the colony."""
    _, name = colony_dir

    runtime = _bare_runtime()
    runtime._mcp_tool_names_all = {"read_file", "write_file"}
    runtime.set_tool_allowlist(None)

    session = _FakeSession(colony_name=name, colony=runtime)
    manager = _FakeManager(
        _sessions={session.id: session},
        _mcp_tool_catalog={
            "files-tools": [
                {"name": "read_file", "description": "", "input_schema": {}},
                {"name": "write_file", "description": "", "input_schema": {}},
            ]
        },
    )

    app = await _app(manager)
    async with TestClient(TestServer(app)) as client:
        response = await client.patch(f"/api/colony/{name}/tools", json={"enabled_mcp_tools": ["read_file"]})
        assert response.status == 200
        payload = await response.json()
        # One live session was attached → one runtime refreshed in place.
        assert payload["refreshed_runtimes"] == 1
        assert runtime._enabled_mcp_tools == ["read_file"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_404_for_unknown_colony(colony_dir):
    """Both verbs must 404 when the colony directory does not exist."""
    app = await _app(_FakeManager())
    async with TestClient(TestServer(app)) as client:
        response = await client.get("/api/colony/unknown/tools")
        assert response.status == 404
        response = await client.patch("/api/colony/unknown/tools", json={"enabled_mcp_tools": None})
        assert response.status == 404
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tools_index_lists_colonies(colony_dir):
    """The index route lists each colony with its queen and allowlist status."""
    _, name = colony_dir
    app = await _app(_FakeManager())
    async with TestClient(TestServer(app)) as client:
        response = await client.get("/api/colonies/tools-index")
        assert response.status == 200
        payload = await response.json()
        by_name = {entry["name"]: entry for entry in payload["colonies"]}
        assert name in by_name
        record = by_name[name]
        assert record["queen_name"] == "queen_technology"
        assert record["has_allowlist"] is False
|
||||
|
||||
|
||||
def test_queen_allowlist_inherits_into_new_colony(tmp_path, monkeypatch):
    """A colony forked with a curated queen inherits her allowlist.

    Exercises the inheritance hook in
    ``routes_execution.fork_session_into_colony`` without running the
    full fork machinery — we just call
    ``update_colony_tools_config`` the same way the hook does and
    assert the colony's ``tools.json`` matches the queen's live list.
    """
    colonies = tmp_path / "colonies"
    colonies.mkdir()
    monkeypatch.setattr("framework.host.colony_tools_config.COLONIES_DIR", colonies)

    from framework.host.colony_tools_config import (
        load_colony_tools_config,
        update_colony_tools_config,
    )

    child = "forked_child"
    (colonies / child).mkdir()

    # Simulate: queen has a curated allowlist (e.g. role default resolved
    # to a concrete list). The inheritance hook copies it verbatim.
    live_allowlist = ["read_file", "web_scrape", "csv_read"]
    update_colony_tools_config(child, list(live_allowlist))

    assert load_colony_tools_config(child) == live_allowlist
|
||||
|
||||
|
||||
def test_legacy_metadata_field_migrates_to_sidecar(colony_dir):
    """A legacy enabled_mcp_tools field in metadata.json is hoisted to tools.json."""
    colonies_dir, name = colony_dir
    meta_path = colonies_dir / name / "metadata.json"
    tools_path = colonies_dir / name / "tools.json"

    # Seed legacy field in metadata.json.
    legacy = json.loads(meta_path.read_text())
    legacy["enabled_mcp_tools"] = ["read_file"]
    meta_path.write_text(json.dumps(legacy))

    from framework.host.colony_tools_config import load_colony_tools_config

    # First load migrates.
    assert load_colony_tools_config(name) == ["read_file"]
    assert tools_path.exists()
    sidecar = json.loads(tools_path.read_text())
    assert sidecar["enabled_mcp_tools"] == ["read_file"]

    # metadata.json no longer contains the field; provenance fields preserved.
    migrated = json.loads(meta_path.read_text())
    assert "enabled_mcp_tools" not in migrated
    assert migrated["queen_name"] == "queen_technology"

    # Second load is a direct sidecar read.
    assert load_colony_tools_config(name) == ["read_file"]
|
||||
@@ -0,0 +1,239 @@
|
||||
"""Tests for the MCP server CRUD HTTP routes.
|
||||
|
||||
Monkey-patches ``MCPRegistry`` inside ``routes_mcp`` so the HTTP layer is
|
||||
exercised without reading or writing ``~/.hive/mcp_registry/installed.json``
|
||||
or spawning actual subprocesses.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from aiohttp import web
|
||||
from aiohttp.test_utils import TestClient, TestServer
|
||||
|
||||
from framework.loader.mcp_errors import MCPError, MCPErrorCode
|
||||
from framework.server import routes_mcp
|
||||
|
||||
|
||||
class _FakeRegistry:
    """Stand-in for MCPRegistry — just enough surface for the routes.

    All state lives in ``self._servers``; no file I/O, no subprocesses.
    The generic "not found" error (identical in enable/disable/health_check
    in the original) is factored into :meth:`_require`.
    """

    def __init__(self) -> None:
        # Seed one registry-sourced entry so list/get have content.
        self._servers: dict[str, dict[str, Any]] = {
            "built-in-seed": {
                "source": "registry",
                "transport": "stdio",
                "enabled": True,
                "manifest": {"description": "Factory-seeded server", "tools": []},
                "last_health_status": "healthy",
                "last_error": None,
                "last_health_check_at": None,
            }
        }

    def _require(self, name: str, code: "MCPErrorCode") -> None:
        """Raise the minimal 'not found' MCPError the routes map to an error status."""
        if name not in self._servers:
            raise MCPError(code=code, what="not found", why="not found", fix="x")

    def initialize(self) -> None:  # noqa: D401 — registry idempotent init
        return

    def list_installed(self) -> list[dict[str, Any]]:
        """Return every installed server as a flat dict including its name."""
        return [{"name": name, **entry} for name, entry in self._servers.items()]

    def get_server(self, name: str) -> dict | None:
        """Return one server with its name merged in, or None when unknown."""
        if name not in self._servers:
            return None
        return {"name": name, **self._servers[name]}

    def add_local(self, *, name: str, transport: str, **kwargs: Any) -> dict:
        """Register a local server; a duplicate name raises MCPError (→ 409)."""
        if name in self._servers:
            raise MCPError(
                code=MCPErrorCode.MCP_INSTALL_FAILED,
                what=f"Server '{name}' already exists",
                why="A server with this name is already registered locally.",
                fix=f"Run: hive mcp remove {name}",
            )
        entry = {
            "source": "local",
            "transport": transport,
            "enabled": True,
            "manifest": {"description": kwargs.get("description") or ""},
            "last_health_status": None,
            "last_error": None,
            "last_health_check_at": None,
        }
        self._servers[name] = entry
        return entry

    def remove(self, name: str) -> None:
        """Delete a server; unknown names raise MCPError with a specific message."""
        if name not in self._servers:
            raise MCPError(
                code=MCPErrorCode.MCP_INSTALL_FAILED,
                what=f"Cannot remove server '{name}'",
                why="Server is not installed.",
                fix="Run: hive mcp list",
            )
        del self._servers[name]

    def enable(self, name: str) -> None:
        """Mark a server enabled; unknown names raise MCPError."""
        self._require(name, MCPErrorCode.MCP_INSTALL_FAILED)
        self._servers[name]["enabled"] = True

    def disable(self, name: str) -> None:
        """Mark a server disabled; unknown names raise MCPError."""
        self._require(name, MCPErrorCode.MCP_INSTALL_FAILED)
        self._servers[name]["enabled"] = False

    def health_check(self, name: str) -> dict[str, Any]:
        """Pretend the server is healthy; unknown names raise MCPError."""
        self._require(name, MCPErrorCode.MCP_HEALTH_FAILED)
        return {"name": name, "status": "healthy", "tools": 3, "error": None}
|
||||
|
||||
|
||||
@pytest.fixture
def registry(monkeypatch):
    """Swap routes_mcp's registry factory for an in-memory fake."""
    fake = _FakeRegistry()
    monkeypatch.setattr(routes_mcp, "_registry", lambda: fake)
    return fake
|
||||
|
||||
|
||||
async def _make_app() -> web.Application:
    """Assemble a bare aiohttp app with only the MCP routes registered."""
    application = web.Application()
    routes_mcp.register_routes(application)
    return application
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_list_servers_returns_built_in(registry):
    """GET /servers merges the registry fake with package-baked entries."""
    app = await _make_app()
    async with TestClient(TestServer(app)) as client:
        response = await client.get("/api/mcp/servers")
        assert response.status == 200
        servers = (await response.json())["servers"]
        # The registry fake carries one entry; the list also merges package-
        # baked entries from core/framework/agents/queen/mcp_servers.json so
        # the UI matches what the queen actually loads. Both should appear.
        names = {s["name"] for s in servers}
        assert "built-in-seed" in names
        sources = {s["name"]: s["source"] for s in servers}
        assert sources.get("built-in-seed") == "registry"
        # The package-baked servers (files-tools/gcu-tools/hive_tools) carry
        # source=="built-in" and are non-removable.
        baked = [s for s in servers if s["source"] == "built-in"]
        assert baked, "expected at least one package-baked MCP server"
        assert all(s.get("removable") is False for s in baked)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_add_local_server(registry):
    """POST creates a local server and it appears in a subsequent listing."""
    app = await _make_app()
    async with TestClient(TestServer(app)) as client:
        spec = {
            "name": "my-tool",
            "transport": "stdio",
            "command": "echo",
            "args": ["hi"],
            "description": "says hi",
        }
        response = await client.post("/api/mcp/servers", json=spec)
        assert response.status == 201
        payload = await response.json()
        assert payload["server"]["name"] == "my-tool"
        assert payload["server"]["source"] == "local"

        response = await client.get("/api/mcp/servers")
        listed = [s["name"] for s in (await response.json())["servers"]]
        assert "my-tool" in listed
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_add_rejects_duplicate(registry):
    """Posting the same server name twice: the first create succeeds, the
    second is refused with 409 and a structured error payload.

    The original looped two POSTs and only asserted the final status, so a
    failing *first* create (e.g. 500) would have gone unnoticed; both
    responses are now checked explicitly.
    """
    app = await _make_app()
    async with TestClient(TestServer(app)) as client:
        spec = {"name": "dup", "transport": "stdio", "command": "x"}
        # First registration must succeed for the duplicate check to mean anything.
        resp = await client.post("/api/mcp/servers", json=spec)
        assert resp.status == 201, await resp.text()
        # Second registration with the same name is a conflict.
        resp = await client.post("/api/mcp/servers", json=spec)
        assert resp.status == 409
        body = await resp.json()
        assert "already exists" in body["error"].lower()
        assert body["fix"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_add_rejects_invalid_transport(registry):
    """An unrecognised transport value is rejected with 400."""
    app = await _make_app()
    async with TestClient(TestServer(app)) as client:
        bad_spec = {"name": "x", "transport": "nope"}
        response = await client.post("/api/mcp/servers", json=bad_spec)
        assert response.status == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_enable_disable_cycle(registry):
    """disable → enable round-trips both the HTTP payload and fake state."""
    app = await _make_app()
    # Seed a local server
    registry.add_local(name="local-one", transport="stdio", command="x")

    async with TestClient(TestServer(app)) as client:
        response = await client.post("/api/mcp/servers/local-one/disable")
        assert response.status == 200
        assert (await response.json())["enabled"] is False
        assert registry._servers["local-one"]["enabled"] is False

        response = await client.post("/api/mcp/servers/local-one/enable")
        assert response.status == 200
        assert (await response.json())["enabled"] is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_remove_local_only(registry):
    """DELETE: built-ins are protected (400), missing names 404, locals removed."""
    app = await _make_app()
    registry.add_local(name="local-two", transport="stdio", command="x")

    async with TestClient(TestServer(app)) as client:
        # Built-ins are protected
        response = await client.delete("/api/mcp/servers/built-in-seed")
        assert response.status == 400

        # Missing
        response = await client.delete("/api/mcp/servers/ghost")
        assert response.status == 404

        # Happy path
        response = await client.delete("/api/mcp/servers/local-two")
        assert response.status == 200
        assert "local-two" not in registry._servers
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_health_check(registry):
    """POST …/health returns the fake's healthy status payload.

    The unused ``monkeypatch`` fixture parameter was removed — the test
    patches nothing beyond what the ``registry`` fixture already does.
    """
    app = await _make_app()
    registry.add_local(name="pingable", transport="stdio", command="x")

    async with TestClient(TestServer(app)) as client:
        resp = await client.post("/api/mcp/servers/pingable/health")
        assert resp.status == 200
        body = await resp.json()
        assert body["status"] == "healthy"
        assert body["tools"] == 3
|
||||
@@ -0,0 +1,486 @@
|
||||
"""Tests for the per-queen MCP tool allowlist filter + routes.
|
||||
|
||||
Covers:
|
||||
1. QueenPhaseState filter semantics (default-allow, allowlist, empty, phase-
|
||||
isolation, memo identity for LLM prompt-cache stability).
|
||||
2. routes_queen_tools round trip (GET, PATCH, validation, live-session
|
||||
hot-reload).
|
||||
|
||||
Route tests monkey-patch a tiny queen profile + manager catalog; they never
|
||||
spawn an MCP subprocess.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
from aiohttp import web
|
||||
from aiohttp.test_utils import TestClient, TestServer
|
||||
|
||||
from framework.llm.provider import Tool
|
||||
from framework.server import routes_queen_tools
|
||||
from framework.tools.queen_lifecycle_tools import QueenPhaseState
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# QueenPhaseState filter — pure unit tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _tool(name: str) -> Tool:
    """Return a throwaway Tool stub; only the name matters to the filters."""
    return Tool(
        name=name,
        description=f"desc of {name}",
        parameters={"type": "object"},
    )
|
||||
|
||||
|
||||
class TestPhaseStateFilter:
    """Unit tests for QueenPhaseState's independent-phase tool filter."""

    @staticmethod
    def _state(tools, catalog, enabled, working=None):
        """Build an independent-phase state, set the allowlist, rebuild filter."""
        ps = QueenPhaseState(phase="independent")
        ps.independent_tools = tools
        if working is not None:
            ps.working_tools = working
        ps.mcp_tool_names_all = catalog
        ps.enabled_mcp_tools = enabled
        ps.rebuild_independent_filter()
        return ps

    def test_default_allow_returns_every_tool(self):
        ps = self._state(
            [_tool("mcp_a"), _tool("mcp_b"), _tool("lc_c")],
            {"mcp_a", "mcp_b"},
            None,
        )
        assert [t.name for t in ps.get_current_tools()] == ["mcp_a", "mcp_b", "lc_c"]

    def test_allowlist_keeps_listed_mcp_plus_all_lifecycle(self):
        ps = self._state(
            [_tool("mcp_a"), _tool("mcp_b"), _tool("lc_c")],
            {"mcp_a", "mcp_b"},
            ["mcp_a"],
        )
        assert [t.name for t in ps.get_current_tools()] == ["mcp_a", "lc_c"]

    def test_empty_allowlist_keeps_only_lifecycle(self):
        ps = self._state(
            [_tool("mcp_a"), _tool("mcp_b"), _tool("lc_c")],
            {"mcp_a", "mcp_b"},
            [],
        )
        assert [t.name for t in ps.get_current_tools()] == ["lc_c"]

    def test_filter_isolated_to_independent_phase(self):
        ps = self._state(
            [_tool("mcp_a"), _tool("lc_c")],
            {"mcp_a"},
            [],
            working=[_tool("mcp_a"), _tool("lc_c")],
        )
        # Independent → filtered
        assert [t.name for t in ps.get_current_tools()] == ["lc_c"]
        # Other phases → unaffected
        ps.phase = "working"
        assert [t.name for t in ps.get_current_tools()] == ["mcp_a", "lc_c"]

    def test_memo_returns_stable_identity_for_prompt_cache(self):
        """Same Python list object across turns → LLM prompt cache stays warm."""
        ps = self._state([_tool("mcp_a"), _tool("lc_c")], {"mcp_a"}, None)
        first = ps.get_current_tools()
        second = ps.get_current_tools()
        assert first is second, "memoized list must be the same object across turns"

        # A rebuild should produce a different object so downstream caches
        # correctly invalidate.
        ps.enabled_mcp_tools = ["mcp_a"]
        ps.rebuild_independent_filter()
        third = ps.get_current_tools()
        assert third is not first
        assert [t.name for t in third] == ["mcp_a", "lc_c"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Route round-trip tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class _FakeSession:
    """Minimal live-session double with the attributes the queen routes read."""

    queen_name: str
    phase_state: QueenPhaseState
    colony_runtime: Any = None
    id: str = "sess-1"
    # Slot probed by the routes; None means no registry attached.
    _queen_tool_registry: Any = None
|
||||
|
||||
|
||||
@dataclass
class _FakeManager:
    """Bare manager double: a session table plus the MCP tool catalog."""

    _sessions: dict = field(default_factory=dict)
    _mcp_tool_catalog: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
@pytest.fixture
def queen_dir(tmp_path, monkeypatch):
    """Redirect queen profile + tools storage into a tmp dir."""
    queens_dir = tmp_path / "queens"
    queens_dir.mkdir()
    # Both modules hold their own QUEENS_DIR reference; patch each.
    for module in (
        "framework.agents.queen.queen_profiles",
        "framework.agents.queen.queen_tools_config",
    ):
        monkeypatch.setattr(f"{module}.QUEENS_DIR", queens_dir)

    queen_id = "queen_technology"
    profile_dir = queens_dir / queen_id
    profile_dir.mkdir()
    profile = {"name": "Alexandra", "title": "Head of Technology"}
    (profile_dir / "profile.yaml").write_text(yaml.safe_dump(profile))
    return queens_dir, queen_id
|
||||
|
||||
|
||||
async def _make_app(*, manager: _FakeManager) -> web.Application:
    """Assemble an aiohttp app exposing the queen-tools routes over *manager*."""
    application = web.Application()
    application["manager"] = manager
    routes_queen_tools.register_routes(application)
    return application
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_tools_default_allows_everything_for_unknown_queen(queen_dir, monkeypatch):
    """Queens NOT in the role-default table fall back to allow-all."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)

    queens_dir, _ = queen_dir
    # Use a queen id that isn't in QUEEN_DEFAULT_CATEGORIES so we exercise
    # the fallback-to-allow-all path.
    custom_id = "queen_custom_unknown"
    (queens_dir / custom_id).mkdir()
    profile = yaml.safe_dump({"name": "Custom", "title": "Custom Role"})
    (queens_dir / custom_id / "profile.yaml").write_text(profile)

    manager = _FakeManager()
    manager._mcp_tool_catalog = {
        "files-tools": [
            {"name": "read_file", "description": "read", "input_schema": {}},
            {"name": "write_file", "description": "write", "input_schema": {}},
        ],
    }

    app = await _make_app(manager=manager)
    async with TestClient(TestServer(app)) as client:
        response = await client.get(f"/api/queen/{custom_id}/tools")
        assert response.status == 200
        payload = await response.json()

    assert payload["enabled_mcp_tools"] is None
    assert payload["is_role_default"] is True  # no sidecar → default-allow
    assert payload["stale"] is False
    servers = {s["name"]: s for s in payload["mcp_servers"]}
    assert set(servers) == {"files-tools"}
    for entry in servers["files-tools"]["tools"]:
        assert entry["enabled"] is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_tools_applies_role_default(queen_dir, monkeypatch):
    """Known persona queens get their role-based default allowlist."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)
    _, queen_id = queen_dir  # queen_technology — has a role default

    # files-tools is referenced by the technology role via the
    # @server:files-tools shorthand in `file_ops`, so its tools should
    # bubble into the default.  unrelated-server is referenced by no role
    # category — its tools must NOT leak in.
    manager = _FakeManager()
    manager._mcp_tool_catalog = {
        "files-tools": [
            {"name": "read_file", "description": "", "input_schema": {}},
            {"name": "edit_file", "description": "", "input_schema": {}},
        ],
        "unrelated-server": [
            {"name": "fluffy_unknown_tool", "description": "", "input_schema": {}},
        ],
    }

    app = await _make_app(manager=manager)
    async with TestClient(TestServer(app)) as client:
        response = await client.get(f"/api/queen/{queen_id}/tools")
        assert response.status == 200
        payload = await response.json()

    assert payload["is_role_default"] is True
    allowed = set(payload["enabled_mcp_tools"] or [])
    # The @server:files-tools shorthand pulls in every tool on that server.
    assert {"read_file", "edit_file"} <= allowed
    # Tools under a server the role never references stay out of the default.
    assert "fluffy_unknown_tool" not in allowed
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_get_tools_exposes_categories(queen_dir, monkeypatch):
    """Response includes the category catalog with role-default flags."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)
    _, queen_id = queen_dir  # queen_technology

    manager = _FakeManager()
    manager._mcp_tool_catalog = {
        "files-tools": [
            {"name": "read_file", "description": "", "input_schema": {}},
            {"name": "edit_file", "description": "", "input_schema": {}},
        ],
    }

    app = await _make_app(manager=manager)
    async with TestClient(TestServer(app)) as client:
        response = await client.get(f"/api/queen/{queen_id}/tools")
        assert response.status == 200
        payload = await response.json()

    categories = {entry["name"]: entry for entry in payload["categories"]}

    # Categories that feed queen_technology's role default are flagged.
    assert categories["file_ops"]["in_role_default"] is True
    assert categories["browser_basic"]["in_role_default"] is True

    # A category queen_technology doesn't use is still exposed so the
    # frontend can group/show it — just not flagged as part of the default.
    assert "spreadsheet_advanced" in categories
    assert categories["spreadsheet_advanced"]["in_role_default"] is False

    # Security was removed from queen_technology defaults.
    assert categories["security"]["in_role_default"] is False

    # @server:files-tools shorthand expanded against the catalog.
    for tool_name in ("read_file", "edit_file"):
        assert tool_name in categories["file_ops"]["tools"]
|
||||
|
||||
|
||||
def test_resolve_queen_default_tools_expands_server_shorthand():
    """@server:NAME shorthand expands against the provided catalog."""
    from framework.agents.queen.queen_tools_defaults import resolve_queen_default_tools

    catalog = {"files-tools": [{"name": "read_file"}, {"name": "write_file"}]}

    # queen_brand_design's "file_ops" category resolves via @server:files-tools.
    resolved = resolve_queen_default_tools("queen_brand_design", catalog)

    assert resolved is not None
    assert {"read_file", "write_file"} <= set(resolved)
|
||||
|
||||
|
||||
def test_resolve_queen_default_tools_unknown_queen_returns_none():
    """A queen id with no role-default entry resolves to None."""
    from framework.agents.queen.queen_tools_defaults import resolve_queen_default_tools

    result = resolve_queen_default_tools("queen_made_up", {})
    assert result is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_patch_persists_and_validates(queen_dir, monkeypatch):
    """PATCH persists the allowlist to the tools.json sidecar, leaves
    profile.yaml alone, supports null reset, and rejects unknown tool names."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)
    queens_dir, queen_id = queen_dir

    manager = _FakeManager()
    manager._mcp_tool_catalog = {
        "files-tools": [
            {"name": "read_file", "description": "", "input_schema": {}},
            {"name": "write_file", "description": "", "input_schema": {}},
        ]
    }

    app = await _make_app(manager=manager)
    # Sidecar the PATCH should create, and the profile it must NOT touch.
    tools_path = queens_dir / queen_id / "tools.json"
    profile_path = queens_dir / queen_id / "profile.yaml"

    async with TestClient(TestServer(app)) as client:
        # Happy path: save a one-tool allowlist.
        resp = await client.patch(
            f"/api/queen/{queen_id}/tools",
            json={"enabled_mcp_tools": ["read_file"]},
        )
        assert resp.status == 200
        body = await resp.json()
        assert body["enabled_mcp_tools"] == ["read_file"]

        # Sidecar persisted; profile YAML untouched by tools PATCH
        sidecar = json.loads(tools_path.read_text())
        assert sidecar["enabled_mcp_tools"] == ["read_file"]
        assert "updated_at" in sidecar
        profile = yaml.safe_load(profile_path.read_text())
        assert "enabled_mcp_tools" not in profile

        # GET reflects the new state
        resp = await client.get(f"/api/queen/{queen_id}/tools")
        body = await resp.json()
        assert body["is_role_default"] is False  # user has explicitly saved
        # NOTE(review): despite the name, `servers` maps tool name -> tool
        # entry of the first (only) server in the response.
        servers = {t["name"]: t for t in body["mcp_servers"][0]["tools"]}
        assert servers["read_file"]["enabled"] is True
        assert servers["write_file"]["enabled"] is False

        # Null resets the saved allowlist.
        resp = await client.patch(f"/api/queen/{queen_id}/tools", json={"enabled_mcp_tools": None})
        assert resp.status == 200
        body = await resp.json()
        assert body["enabled_mcp_tools"] is None
        sidecar = json.loads(tools_path.read_text())
        assert sidecar["enabled_mcp_tools"] is None

        # Unknown tool name → 400; sidecar unchanged (still the null from above)
        resp = await client.patch(
            f"/api/queen/{queen_id}/tools",
            json={"enabled_mcp_tools": ["nope_not_a_tool"]},
        )
        assert resp.status == 400
        detail = await resp.json()
        assert "nope_not_a_tool" in detail.get("unknown", [])
        sidecar = json.loads(tools_path.read_text())
        assert sidecar["enabled_mcp_tools"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_patch_hot_reloads_live_session(queen_dir, monkeypatch):
    """PATCH pushes the new allowlist into a live session without a restart."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)
    _, queen_id = queen_dir

    # Build a fake live session whose phase state carries a tool list the
    # filter can gate. We also need a fake registry so
    # _catalog_from_live_session can enumerate tools.
    class _FakeRegistry:
        def __init__(self, server_map, tools_by_name):
            # server name -> set of tool names, as the catalog builder reads it.
            self._mcp_server_tools = server_map
            self._tools_by_name = tools_by_name

        # NOTE(review): dead in this test — immediately replaced by the
        # lambda assignment below, which returns the real Tool objects.
        def get_tools(self):
            return {n: MagicMock(name=n) for n in self._tools_by_name}

    tools_by_name = {"read_file": _tool("read_file"), "write_file": _tool("write_file")}
    registry = _FakeRegistry(
        server_map={"files-tools": {"read_file", "write_file"}},
        tools_by_name=tools_by_name,
    )
    # Patch get_tools to return real Tool objects for name/description plumbing.
    registry.get_tools = lambda: tools_by_name  # type: ignore[method-assign]

    # Start in the "independent" phase with both tools allowed (no allowlist).
    phase_state = QueenPhaseState(phase="independent")
    phase_state.independent_tools = [tools_by_name["read_file"], tools_by_name["write_file"]]
    phase_state.mcp_tool_names_all = {"read_file", "write_file"}
    phase_state.enabled_mcp_tools = None
    phase_state.rebuild_independent_filter()

    session = _FakeSession(queen_name=queen_id, phase_state=phase_state)
    session._queen_tool_registry = registry
    manager = _FakeManager(_sessions={"sess-1": session})

    app = await _make_app(manager=manager)
    async with TestClient(TestServer(app)) as client:
        resp = await client.patch(
            f"/api/queen/{queen_id}/tools",
            json={"enabled_mcp_tools": ["read_file"]},
        )
        assert resp.status == 200
        body = await resp.json()
        # Exactly the one live session seeded above was refreshed in place.
        assert body["refreshed_sessions"] == 1

    # Session's phase state reflects the new allowlist without a restart
    current = phase_state.get_current_tools()
    assert [t.name for t in current] == ["read_file"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_missing_queen_returns_404(queen_dir, monkeypatch):
    """Both GET and PATCH answer 404 for a queen id that doesn't exist."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)

    app = await _make_app(manager=_FakeManager())
    async with TestClient(TestServer(app)) as client:
        get_response = await client.get("/api/queen/queen_nonexistent/tools")
        assert get_response.status == 404

        patch_response = await client.patch(
            "/api/queen/queen_nonexistent/tools",
            json={"enabled_mcp_tools": None},
        )
        assert patch_response.status == 404
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_delete_restores_role_default(queen_dir, monkeypatch):
    """DELETE removes tools.json so the queen falls back to the role default."""
    monkeypatch.setattr(routes_queen_tools, "ensure_default_queens", lambda: None)
    queens_dir, queen_id = queen_dir
    tools_path = queens_dir / queen_id / "tools.json"

    manager = _FakeManager()
    manager._mcp_tool_catalog = {
        "files-tools": [
            {"name": "read_file", "description": "", "input_schema": {}},
            # pdf_read lives in hive_tools but is named explicitly in the
            # file_ops category, so we stage it in any server here just to
            # surface it through the catalog.
            {"name": "pdf_read", "description": "", "input_schema": {}},
        ],
    }

    app = await _make_app(manager=manager)
    async with TestClient(TestServer(app)) as client:
        # Seed a custom allowlist first so we have a sidecar to delete.
        resp = await client.patch(
            f"/api/queen/{queen_id}/tools",
            json={"enabled_mcp_tools": ["read_file"]},
        )
        assert resp.status == 200
        assert tools_path.exists()

        # DELETE removes the sidecar and reports the restored default.
        resp = await client.delete(f"/api/queen/{queen_id}/tools")
        assert resp.status == 200
        body = await resp.json()
        assert body["removed"] is True
        assert body["is_role_default"] is True
        assert not tools_path.exists()

        # The new effective list is the role default for queen_technology;
        # security tools were intentionally removed, so port_scan must NOT
        # appear, while file_ops members like read_file/pdf_read do.
        enabled = set(body["enabled_mcp_tools"] or [])
        assert "read_file" in enabled
        assert "pdf_read" in enabled
        assert "port_scan" not in enabled
        assert "subdomain_enumerate" not in enabled

        # GET confirms.
        resp = await client.get(f"/api/queen/{queen_id}/tools")
        body = await resp.json()
        assert body["is_role_default"] is True

        # Deleting again is a no-op.
        resp = await client.delete(f"/api/queen/{queen_id}/tools")
        assert resp.status == 200
        assert (await resp.json())["removed"] is False
|
||||
|
||||
|
||||
def test_legacy_profile_field_migrates_to_sidecar(queen_dir):
    """A legacy enabled_mcp_tools field in profile.yaml is hoisted to tools.json."""
    from framework.agents.queen.queen_tools_config import load_queen_tools_config

    queens_dir, queen_id = queen_dir
    queen_root = queens_dir / queen_id
    profile_path = queen_root / "profile.yaml"
    tools_path = queen_root / "tools.json"
    legacy_tools = ["read_file", "write_file"]

    # Plant the legacy field directly in profile.yaml.
    profile = yaml.safe_load(profile_path.read_text()) or {}
    profile["enabled_mcp_tools"] = legacy_tools
    profile_path.write_text(yaml.safe_dump(profile, sort_keys=False))

    # The first load performs the migration...
    assert load_queen_tools_config(queen_id) == legacy_tools
    # ...creating the sidecar with the hoisted list...
    assert tools_path.exists()
    assert json.loads(tools_path.read_text())["enabled_mcp_tools"] == legacy_tools

    # ...and scrubbing the field from profile.yaml while keeping the rest.
    migrated = yaml.safe_load(profile_path.read_text())
    assert "enabled_mcp_tools" not in migrated
    assert migrated["name"] == "Alexandra"

    # A second load is a plain sidecar read — result unchanged.
    assert load_queen_tools_config(queen_id) == legacy_tools
|
||||
@@ -11,7 +11,7 @@ metadata:
|
||||
|
||||
**Applies when** your spawn message has `db_path:` and `colony_id:` fields. The DB is your durable working memory — tells you what's done, what to skip, which SOP gates you owe.
|
||||
|
||||
Access via `execute_command_tool` running `sqlite3 "<db_path>" "..."`. Tables: `tasks` (queue), `steps` (per-task decomposition), `sop_checklist` (hard gates).
|
||||
Access via `terminal_exec` running `sqlite3 "<db_path>" "..."`. Tables: `tasks` (queue), `steps` (per-task decomposition), `sop_checklist` (hard gates).
|
||||
|
||||
### Claim: assigned task (check this FIRST)
|
||||
|
||||
|
||||
+4
-4
@@ -113,7 +113,7 @@ Even after `wait_until="load"`, React/Vue SPAs often render their real chrome in
|
||||
### Reading pages efficiently
|
||||
|
||||
- **Prefer `browser_snapshot` over `browser_get_text("body")`** — returns a compact ~1–5 KB accessibility tree vs 100+ KB of raw HTML.
|
||||
- Interaction tools `browser_click`, `browser_type`, `browser_type_focused`, `browser_fill`, and `browser_scroll` wait 0.5 s for the page to settle after a successful action, then attach a fresh accessibility snapshot under the `snapshot` key of their result. Use it to decide your next action — do NOT call `browser_snapshot` separately after every action. Tune the capture via `auto_snapshot_mode`: `"default"` (full tree, the default), `"simple"` (trims unnamed structural nodes), `"interactive"` (only controls — tightest token footprint), or `"off"` to skip the capture entirely (useful when batching several interactions and you don't need the intermediate trees). Call `browser_snapshot` explicitly only when you need a newer view or a different mode than what was auto-captured.
|
||||
- Interaction tools `browser_click`, `browser_type`, `browser_type_focused`, and `browser_scroll` wait 0.5 s for the page to settle after a successful action, then attach a fresh accessibility snapshot under the `snapshot` key of their result. Use it to decide your next action — do NOT call `browser_snapshot` separately after every action. Tune the capture via `auto_snapshot_mode`: `"default"` (full tree, the default), `"simple"` (trims unnamed structural nodes), `"interactive"` (only controls — tightest token footprint), or `"off"` to skip the capture entirely (useful when batching several interactions and you don't need the intermediate trees). Call `browser_snapshot` explicitly only when you need a newer view or a different mode than what was auto-captured.
|
||||
- Complex pages (LinkedIn, Twitter/X, SPAs with virtual scrolling) can have DOMs that don't match what's visually rendered — snapshot refs may be stale, missing, or misaligned with visible layout. Try the available snapshot first; when the target is not present in that snapshot or visual position matters, switch to `browser_screenshot` to orient yourself.
|
||||
- Only fall back to `browser_get_text` for extracting specific small elements by CSS selector.
|
||||
|
||||
@@ -244,8 +244,8 @@ The highlight overlay stays visible on the page for **10 seconds** after each in
|
||||
|
||||
**Close tabs as soon as you are done with them** — not only at the end of the task. After reading or extracting data from a tab, close it immediately.
|
||||
|
||||
- Finished reading/extracting from a tab? `browser_close(target_id=...)`
|
||||
- Completed a multi-tab workflow? `browser_close_finished()` to clean up all your tabs
|
||||
- Finished reading/extracting from a tab? `browser_close(tab_id=...)` (or no arg to close the active tab)
|
||||
- Completed a multi-tab workflow? Call `browser_close` for each tab you opened — list with `browser_tabs` first if you've lost track of IDs
|
||||
- More than 3 tabs open? Stop and close finished ones before opening more
|
||||
- Popup appeared that you didn't need? Close it immediately
|
||||
|
||||
@@ -410,7 +410,7 @@ In all of these cases the script is SHORT (< 10 lines) and the result is CONSUME
|
||||
- If a tool fails, retry once with the same approach.
|
||||
- If it fails a second time, STOP retrying and switch approach.
|
||||
- If `browser_snapshot` fails, try `browser_get_text` with a specific small selector as fallback.
|
||||
- If `browser_open` fails or page seems stale, `browser_stop`, then `browser_start`, then retry.
|
||||
- If `browser_open` fails or page seems stale, `browser_stop`, then `browser_open(url)` again to recreate a fresh context.
|
||||
|
||||
## Verified workflows
|
||||
|
||||
@@ -0,0 +1,160 @@
|
||||
---
|
||||
name: hive.chart-creation-foundations
|
||||
description: Required reading whenever any chart_* tool is available. Teaches the one-tool embedding contract (call chart_render → live chart appears in chat AND a downloadable PNG lands in the queen session dir), the ECharts (data viz) vs Mermaid (structural diagrams) decision, the BI/financial-grade aesthetic baseline (no chartjunk, restrained palette, proper typography, single message per chart), and the canonical spec patterns for the 12 most-common chart types. Skipping this leads to 1990s-Excel charts, missing downloads, and the agent writing markdown image links by hand instead of letting chart_render drive the UI.
|
||||
metadata:
|
||||
author: hive
|
||||
type: preset-skill
|
||||
version: "1.0"
|
||||
---
|
||||
|
||||
# Chart creation foundations
|
||||
|
||||
These tools render BI/financial-analyst-grade charts and diagrams that show up live in the chat AND save as high-DPI PNGs in the user's queen session dir.
|
||||
|
||||
## The embedding contract — one rule
|
||||
|
||||
> **To put a chart in chat, call `chart_render`. The chat reads `result.spec` and renders the chart live in the message bubble. The download link is `result.file_url`. Do not write `![chart](...)` image markdown by hand — the tool's result drives the UI.**
|
||||
|
||||
That's it. One tool call, one chart in chat, one file on disk. No two-step "remember to also save it" pattern. The chat's chart-rendering UI is fed by the tool result envelope automatically.
|
||||
|
||||
## When to chart at all
|
||||
|
||||
Chart when the data is **visual at heart**: trends over time, distributions, comparisons across categories, hierarchies, flows, geo. Skip the chart when:
|
||||
|
||||
- The point is one number → just say it. ("Revenue was $4.2M, up 12% YoY.")
|
||||
- The point is a ranking of 5 things → use a markdown table with bold and emoji indicators.
|
||||
- The data is so noisy a chart would mislead → describe the takeaway in prose.
|
||||
|
||||
A chart costs the user attention. It must repay that cost with a takeaway they couldn't get from prose.
|
||||
|
||||
## ECharts vs Mermaid — the picking rule
|
||||
|
||||
| Use ECharts (`kind: "echarts"`) when... | Use Mermaid (`kind: "mermaid"`) when... |
|
||||
|---|---|
|
||||
| You're plotting **numbers over categories or time** | You're showing **structure, not data** |
|
||||
| Bar / line / area / scatter / candlestick / heatmap / treemap / sankey / parallel coordinates / calendar / gauge / pie / sunburst / geo map | Flowchart / sequence / gantt / ERD / state diagram / mindmap / class diagram / C4 architecture |
|
||||
| The viewer's question is "how much / how many / what's the trend" | The viewer's question is "what calls what / what depends on what / what happens after what" |
|
||||
|
||||
If both fit (rare), prefer ECharts — its rasterized output is a proper data chart for slides; Mermaid's diagrams are for technical docs.
|
||||
|
||||
## The aesthetic baseline (non-negotiable)
|
||||
|
||||
These are the rules that turn an Excel-default chart into a Tableau-grade one. Every chart you produce must follow them.
|
||||
|
||||
### 1. Theme & background
|
||||
- `chart_render` has **no `theme` parameter**. The renderer reads the user's UI theme from the desktop env (`HIVE_DESKTOP_THEME`) so the saved PNG matches what the user is actually looking at. You don't pick; the system does.
|
||||
- Title goes in `option.title.text`, NOT in the message body. The chart is self-contained.
|
||||
|
||||
### 2. Palette discipline — DO NOT set `color` on series
|
||||
|
||||
The OpenHive ECharts theme is auto-applied to every `chart_render` call. It defines:
|
||||
- An 8-hue **categorical palette** for multi-series charts (honey orange, slate blue, sage, terracotta, bronze, indigo, olive, rust)
|
||||
- Cozy spacing (`grid.top: 90`, `grid.bottom: 56`, etc.)
|
||||
- Brand typography (Inter Tight)
|
||||
- Tasteful axis lines + dashed gridlines
|
||||
|
||||
**Do not set `option.color`, `option.title.textStyle`, `option.grid`, or `option.itemStyle.color` on series.** The theme covers it. If you do override, you'll fight the brand palette and the chart will look generic.
|
||||
|
||||
When you need data-encoded color (NOT category color):
|
||||
- **Sequential** (magnitude): use `visualMap` with `inRange.color: ['#fff7e0', '#db6f02']` (light-to-honey)
|
||||
- **Diverging** (positive/negative): use `visualMap` with `inRange.color: ['#a8453d', '#f5f5f5', '#3d7a4a']` (terracotta/neutral/sage)
|
||||
- **Semantic up/down** (candlestick is auto-themed): for explicit gain/loss bars use `#3d7a4a` (gain) and `#a8453d` (loss), NOT `#27ae60` / `#e74c3c`.
|
||||
|
||||
### 3. Typography
|
||||
The default font (`-apple-system, "Inter Tight", system-ui`) is already wired in the renderer — don't override unless the user asked. Set `option.textStyle.fontSize: 13` for body labels, `16` for axis names, `18` bold for the title.
|
||||
|
||||
### 4. No chartjunk
|
||||
- **No 3D**. Ever. 3D pie charts and 3D bar charts are visual lies.
|
||||
- **No drop shadows** on bars or lines. The default flat ECharts look is correct.
|
||||
- **No gradient fills** unless the gradient encodes data (e.g. heatmap fill).
|
||||
- **No neon colors**. Saturation belongs on highlighted bars, not on every series.
|
||||
- **No more than 5 stacked colors** in a stacked bar — past that the eye can't separate them.
|
||||
|
||||
### 5. Axis hygiene
|
||||
- X-axis labels rotate 45° only when they overflow. Otherwise horizontal.
|
||||
- Y-axis starts at 0 for bar/area charts (truncating misleads). Line charts can start at min - 5%.
|
||||
- Use `option.yAxis.axisLabel.formatter: '{value} M'` to add units, NOT a separate "USD millions" subtitle.
|
||||
- Date axes: pass ISO strings (`"2024-01-15"`) and ECharts handles the layout. Use `xAxis.type: "time"`.
|
||||
|
||||
### 6. One message per chart
|
||||
Every chart goes in its own assistant message (or its own `chart_render` call). Do not pile 4 charts into one wall of tool calls — the user can't focus and the chat gets noisy.
|
||||
|
||||
## Calling `chart_render` — the canonical pattern
|
||||
|
||||
```
|
||||
chart_render(
|
||||
kind="echarts",
|
||||
spec={
|
||||
"title": {"text": "Q4 revenue by region", "left": "center"},
|
||||
"tooltip": {"trigger": "axis"},
|
||||
"xAxis": {"type": "category", "data": ["NA", "EU", "APAC", "LATAM"]},
|
||||
"yAxis": {"type": "value", "axisLabel": {"formatter": "${value}M"}},
|
||||
"series": [{"type": "bar", "data": [12.4, 8.7, 5.3, 2.1], "itemStyle": {"color": "#db6f02"}}]
|
||||
},
|
||||
title="q4-revenue-by-region",
|
||||
width=1600, height=900, dpi=300
|
||||
)
|
||||
```
|
||||
|
||||
Returns:
|
||||
```
|
||||
{
|
||||
"kind": "echarts",
|
||||
"spec": {...echoed...},
|
||||
"file_path": "/.../charts/2026-04-30T...q4-revenue-by-region.png",
|
||||
"file_url": "file:///.../q4-revenue-by-region.png",
|
||||
"width": 1600, "height": 900, "dpi": 300, "bytes": 142318,
|
||||
"title": "q4-revenue-by-region", "runtime_ms": 287
|
||||
}
|
||||
```
|
||||
|
||||
The chat panel reads `result.spec` and mounts ECharts in the message bubble. The user sees the chart immediately. The PNG is on disk and the chat shows a download link from `result.file_url`. **You don't write that link — it appears automatically.**
|
||||
|
||||
## The 12 chart types you'll use 95% of the time
|
||||
|
||||
| When | ECharts type | Notes |
|
||||
|---|---|---|
|
||||
| Trend over time | `series.type: "line"` | Smooth = `smooth: true` only when data is noisy |
|
||||
| Multi-metric trend | Two `line` series with `yAxis: [{}, {}]` | Separate scales when units differ |
|
||||
| Category comparison | `series.type: "bar"` | Sort by value descending, not alphabetically |
|
||||
| Stacked composition | `bar` with `stack: "total"` | Cap at 5 categories |
|
||||
| Distribution | `series.type: "boxplot"` or `bar` of bins | Boxplot for ≥3 groups; histogram for one |
|
||||
| Two-variable correlation | `series.type: "scatter"` | Add `regression` markline if relevant |
|
||||
| Candlestick / OHLC | `series.type: "candlestick"` | Date axis + `dataZoom` range slider |
|
||||
| Geo distribution | `series.type: "map"` | Bundled `world` and country GeoJSONs |
|
||||
| Hierarchy / share | `series.type: "treemap"` or `sunburst` | Use treemap for >12 leaves; pie only for 2-5 |
|
||||
| Flow | `series.type: "sankey"` | Names matter — keep them short |
|
||||
| Calendar density | `series.type: "heatmap"` + `calendar` | Daily metrics over a year |
|
||||
| KPI scorecard | `series.type: "gauge"` | Set `min`, `max`, threshold band |
|
||||
|
||||
Worked specs for each are in `references/` — paste, modify, render.
|
||||
|
||||
## Mermaid quick rules
|
||||
|
||||
```
|
||||
chart_render(
|
||||
kind="mermaid",
|
||||
spec="""
|
||||
flowchart LR
|
||||
A[Customer signs up] --> B{Onboarded?}
|
||||
B -- yes --> C[Activate trial]
|
||||
B -- no --> D[Email reminder]
|
||||
""",
|
||||
title="signup-flow"
|
||||
)
|
||||
```
|
||||
|
||||
- One diagram per chart_render call.
|
||||
- Keep node labels short (≤20 chars).
|
||||
- Use `flowchart LR` for left-to-right; `TD` for top-down. LR reads better in a chat bubble.
|
||||
- For sequence diagrams, indicate async with `->>` (open arrow) and sync return with `-->>` (dashed).
|
||||
- Don't try to encode data in mermaid (no widths, no quantities) — that's an ECharts job.
|
||||
|
||||
## Common mistakes the agent makes
|
||||
|
||||
1. **Writing `![chart](...)` image markdown by hand.** Don't. The chat renders from the tool result automatically. Manual image markdown will display nothing (file:// is blocked from arbitrary chat content).
|
||||
2. **Calling chart_render twice for the same chart "to embed and to save".** Only one call. The single call does both.
|
||||
3. **Overriding fonts to fancy display faces.** Stay with the default; the agent's job is data, not typography.
|
||||
4. **Pie charts with 12 slices.** Use a horizontal bar chart sorted by value. Pie is only for 2-5 mutually-exclusive shares.
|
||||
5. **Forgetting `axisLabel.formatter` for currency / percentage.** A y-axis showing "12000000" is unreadable; "12M" is correct.
|
||||
6. **Putting a chart's title in the message body.** Set `option.title.text` instead so the title is part of the saved PNG.
|
||||
@@ -0,0 +1,139 @@
|
||||
---
|
||||
name: hive.terminal-tools-foundations
|
||||
description: Required reading whenever any shell_* tool is available. Teaches the foreground/background dichotomy (terminal_exec auto-promotes past 30s, returns a job_id you poll with terminal_job_logs), the standard envelope shape (exit_code, stdout, stdout_truncated_bytes, output_handle, semantic_status, warning, auto_backgrounded, job_id), output handle pagination via terminal_output_get, when to read semantic_status instead of raw exit_code (grep/rg/find/diff/test exit 1 is NOT an error), the destructive-warning surface (rm -rf, git push --force, DROP TABLE), tool preference (use files-tools / gcu-tools / hive_tools before raw shell), and the bash-only-on-macOS policy. Skipping this leads to "tool returned no output" surprises, orphaned jobs, and panic over benign grep exit codes.
|
||||
metadata:
|
||||
author: hive
|
||||
type: preset-skill
|
||||
version: "1.0"
|
||||
---
|
||||
|
||||
# terminal-tools — foundations
|
||||
|
||||
These tools give you a real terminal: foreground exec with smart envelopes, background jobs with offset-based log streaming, persistent PTY shells, and filesystem search. Bash-only on POSIX.
|
||||
|
||||
## Tool preference (read first)
|
||||
|
||||
Before reaching for terminal-tools, check whether a higher-level tool already covers the task. Shell is for system operations the other servers don't reach.
|
||||
|
||||
- **Reading files** → `files-tools.read_file` (handles size, paging, line-numbered output) — NOT `terminal_exec("cat ...")`
|
||||
- **Editing files** → `files-tools.edit_file` (atomic patch with diff verification) — NOT `terminal_exec("sed -i ...")`
|
||||
- **Writing files** → `files-tools.write_file` — NOT `terminal_exec("echo > ...")`
|
||||
- **In-project search** → `files-tools.search_files` (project-scoped, code-aware) — use `terminal_rg` only for raw paths outside the project (`/var/log`, `/etc`)
|
||||
- **Browser / web pages** → `gcu-tools.browser_*` for rendered pages — NOT `terminal_exec("curl ...")`
|
||||
- **Web search** → `hive_tools.web_search` — NOT scraping
|
||||
- **System operations** (process exec, jobs, PTYs, raw fs search) → terminal-tools. This is its territory.
|
||||
|
||||
## The standard envelope
|
||||
|
||||
Every spawn-style call (`terminal_exec`, the auto-promoted job state) returns this shape:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"exit_code": 0, // null when auto-backgrounded or pre-spawn error
|
||||
"stdout": "...", // decoded, truncated to max_output_kb (default 256 KB)
|
||||
"stderr": "...",
|
||||
"stdout_truncated_bytes": 0, // > 0 means more is in output_handle
|
||||
"stderr_truncated_bytes": 0,
|
||||
"runtime_ms": 42,
|
||||
"pid": 12345,
|
||||
"output_handle": null, // "out_<hex>" when truncated — paginate with terminal_output_get
|
||||
"timed_out": false,
|
||||
"semantic_status": "ok", // "ok" | "signal" | "error" — read THIS, not just exit_code
|
||||
"semantic_message": null, // e.g. "No matches found" for grep exit 1
|
||||
"warning": null, // e.g. "may force-remove files" for rm -rf
|
||||
"auto_backgrounded": false,
|
||||
"job_id": null // set when auto_backgrounded=true
|
||||
}
|
||||
```
|
||||
|
||||
## Auto-promotion (the core mental model)
|
||||
|
||||
`terminal_exec` runs commands in the foreground until the **auto-background budget** (default 30s) elapses. Past that point, the process is silently transferred to a background job and the call returns immediately with:
|
||||
|
||||
```jsonc
|
||||
{ "auto_backgrounded": true, "exit_code": null, "job_id": "job_<hex>", ... }
|
||||
```
|
||||
|
||||
When you see `auto_backgrounded: true`, **pivot to polling**. The job is still running:
|
||||
|
||||
```
|
||||
terminal_job_logs(job_id, since_offset=0, wait_until_exit=true, wait_timeout_sec=60)
|
||||
→ blocks server-side until the job exits or the timeout, returns logs + status
|
||||
```
|
||||
|
||||
You're not failing — you're freed up to do other work while the long task runs.
|
||||
|
||||
To force pure-foreground (kill on `timeout_sec`), pass `auto_background_after_sec=0`. Use this when you genuinely don't want a background job (small commands where promotion would surprise you).
|
||||
|
||||
## Semantic exit codes — read `semantic_status`, not raw `exit_code`
|
||||
|
||||
Several common commands use exit 1 for legitimate non-error states:
|
||||
|
||||
| Command | exit 0 | exit 1 |
|
||||
|---|---|---|
|
||||
| `grep` / `rg` | matches found | **no matches** (not an error) |
|
||||
| `find` | success | **some dirs unreadable** (informational) |
|
||||
| `diff` | identical | **files differ** (informational) |
|
||||
| `test` / `[` | true | **false** (informational) |
|
||||
|
||||
For these, `semantic_status` will be `"ok"` even when `exit_code == 1`, with `semantic_message` describing why ("No matches found"). For everything else, `semantic_status` defaults to `"ok"` on 0 and `"error"` on nonzero.
|
||||
|
||||
**Rule**: always check `semantic_status` first. Only fall back to `exit_code` when you need the exact number (e.g. distinguishing `make` errors).
|
||||
|
||||
## Destructive warnings — re-read your command
|
||||
|
||||
The envelope's `warning` field is set when the command matches a known destructive pattern (`rm -rf`, `git push --force`, `git reset --hard`, `DROP TABLE`, `kubectl delete`, `terraform destroy`, etc.). The command **still ran** — the warning is informational. Use it as a "did I mean to do that?" prompt before trusting subsequent steps that depend on the side effect.
|
||||
|
||||
If a `warning` appears unexpectedly, stop and verify: was the destructive action intended, or did a path/glob slip in?
|
||||
|
||||
## Output handles — never lose output
|
||||
|
||||
When `stdout_truncated_bytes > 0` or `stderr_truncated_bytes > 0`, the inline output was capped at `max_output_kb` (default 256 KB). The full bytes are stashed under `output_handle` for **5 minutes**. Paginate with:
|
||||
|
||||
```
|
||||
terminal_output_get(output_handle, since_offset=0, max_kb=64)
|
||||
→ { data, offset, next_offset, eof, expired }
|
||||
```
|
||||
|
||||
Track `next_offset` across calls. If `expired: true`, re-run the command (the handle's TTL has lapsed).
|
||||
|
||||
The store has a 64 MB cap with LRU eviction. For huge outputs, prefer `terminal_job_start` + `terminal_job_logs` polling (4 MB ring buffer per stream, infinite total throughput).
|
||||
|
||||
## Bash, not zsh — even on macOS
|
||||
|
||||
`terminal_exec` and `terminal_pty_open` always invoke `/bin/bash`. The user's `$SHELL` is ignored. Explicit `shell="/bin/zsh"` is **rejected** with a clear error. This is a deliberate security stance, not aesthetic — zsh has command/builtin classes (`zmodload`, `=cmd` expansion, `zpty`, `ztcp`, `zf_*`) that bypass bash-shaped checks. The `terminal-tools-pty-sessions` skill explains the implications for PTY sessions specifically.
|
||||
|
||||
`ZDOTDIR` and `ZSH_*` env vars are stripped before exec to prevent zsh dotfiles leaking in. Bash dotfiles still apply when invoked interactively (e.g. PTY sessions use `bash --norc --noprofile` to keep things predictable).
|
||||
|
||||
## Pipelines and complex commands
|
||||
|
||||
Pipes (`|`), redirects (`>`, `<`, `>>`), conditionals (`&&`, `||`, `;`), and globs (`*`, `?`, `[`) are detected automatically. You can pass them with the default `shell=False` and the runtime will transparently route through `/bin/bash -c` and surface `auto_shell: true` in the envelope:
|
||||
|
||||
```
|
||||
terminal_exec("ps aux | sort -k3 -rn | head -40")
|
||||
→ { exit_code: 0, stdout: "...", auto_shell: true, ... }
|
||||
```
|
||||
|
||||
For simple argv commands (no metacharacters) `shell=False` is faster and direct-execs the binary. For commands with shell features that the detector misses (rare — exotic bash builtins, here-strings), pass `shell=True` explicitly:
|
||||
|
||||
```
|
||||
terminal_exec("set -e; complicated bash logic", shell=True)
|
||||
```
|
||||
|
||||
Quoted strings work either way — the detector uses `shlex.split` which handles `"quoted args with spaces"` correctly.
|
||||
|
||||
## When to use what (cheat sheet)
|
||||
|
||||
| Need | Tool |
|
||||
|---|---|
|
||||
| One-shot command, ≤30s | `terminal_exec` |
|
||||
| One-shot command, might be longer | `terminal_exec` (auto-promotes) |
|
||||
| Long-running job from the start | `terminal_job_start` |
|
||||
| State across calls (cd, env, REPL) | `terminal_pty_open` + `terminal_pty_run` |
|
||||
| Search file contents (raw paths) | `terminal_rg` |
|
||||
| Find files by predicate | `terminal_find` |
|
||||
| Retrieve truncated output | `terminal_output_get` |
|
||||
| Tree / stat / du | `terminal_exec("ls -la"/"stat foo"/"du -sh path")` |
|
||||
| HTTP / DNS / ping / archives | `terminal_exec("curl ..."/"dig ..."/"tar xzf ...")` |
|
||||
|
||||
See `references/exit_codes.md` for the full POSIX + signal-induced + semantic catalog.
|
||||
+50
@@ -0,0 +1,50 @@
|
||||
# Exit code reference
|
||||
|
||||
## POSIX conventions
|
||||
|
||||
| Code | Meaning |
|
||||
|---|---|
|
||||
| 0 | Success |
|
||||
| 1 | General error / catchall |
|
||||
| 2 | Misuse of shell builtins, syntax error |
|
||||
| 126 | Command found but not executable |
|
||||
| 127 | Command not found |
|
||||
| 128 | Invalid argument to `exit` |
|
||||
| 128 + N | Killed by signal N |
|
||||
| 130 | Killed by SIGINT (Ctrl-C) |
|
||||
| 137 | Killed by SIGKILL |
|
||||
| 143 | Killed by SIGTERM |
|
||||
| 255 | Exit status out of range |
|
||||
|
||||
When `exit_code < 0` in the envelope, the process was killed by a signal: `abs(exit_code)` is the signal number (subprocess uses negative codes for signaled exits, separate from the `128 + N` shell convention).
|
||||
|
||||
## Semantic exits — when exit 1 is NOT an error
|
||||
|
||||
terminal-tools encodes these in `semantic_status`. The agent should read `semantic_status` first.
|
||||
|
||||
| Command | Code 0 | Code 1 | Code ≥2 |
|
||||
|---|---|---|---|
|
||||
| `grep` / `rg` / `ripgrep` | matches found | **no matches** (ok) | error |
|
||||
| `find` | success | **some dirs unreadable** (ok) | error |
|
||||
| `diff` | files identical | **files differ** (ok) | error |
|
||||
| `test` / `[` | condition true | **condition false** (ok) | error |
|
||||
|
||||
For any command not in this table, the default convention applies (0 = ok, nonzero = error).
|
||||
|
||||
## When `exit_code` is `null`
|
||||
|
||||
- `auto_backgrounded: true` — the process is still running under a `job_id`. Poll with `terminal_job_logs`.
|
||||
- Pre-spawn error (command not found, exec failed) — see `error` field in the envelope.
|
||||
- `timed_out: true` and the process refused to die — extremely rare; the kernel has the answer.
|
||||
|
||||
## Common signal-induced exits
|
||||
|
||||
| Signal | Number | Subprocess exit | Shell exit | Meaning |
|
||||
|---|---|---|---|---|
|
||||
| SIGHUP | 1 | -1 | 129 | Terminal hangup |
|
||||
| SIGINT | 2 | -2 | 130 | Interrupt (Ctrl-C) |
|
||||
| SIGQUIT | 3 | -3 | 131 | Quit (Ctrl-\\) |
|
||||
| SIGKILL | 9 | -9 | 137 | Forced kill (uncatchable) |
|
||||
| SIGTERM | 15 | -15 | 143 | Polite termination |
|
||||
| SIGSEGV | 11 | -11 | 139 | Segmentation fault |
|
||||
| SIGABRT | 6 | -6 | 134 | Abort (assertion failed, etc.) |
|
||||
@@ -0,0 +1,96 @@
|
||||
---
|
||||
name: hive.terminal-tools-fs-search
|
||||
description: Use terminal_rg / terminal_find when you need raw filesystem search outside the project tree — system configs, /var/log, /etc, archive contents — or when files-tools.search_files is too project-scoped. Teaches the rg vs find vs terminal_exec("ls/du/tree") split, common rg flag combos for code/logs/configs, find predicates for mtime/size/type queries, and the rule that for tree views or single-file stat info you should just use terminal_exec instead of inventing a tool. Read before reaching for raw shell to grep or find anything.
|
||||
metadata:
|
||||
author: hive
|
||||
type: preset-skill
|
||||
version: "1.0"
|
||||
---
|
||||
|
||||
# Filesystem search
|
||||
|
||||
terminal-tools provides two structured search tools: `terminal_rg` (ripgrep for content) and `terminal_find` (find for predicates). Everything else (tree, stat, du) is just `terminal_exec`.
|
||||
|
||||
## When to use what
|
||||
|
||||
| Task | Tool |
|
||||
|---|---|
|
||||
| Find code/text matching a pattern in your **project** | `files-tools.search_files` (project-aware, ranks by relevance) |
|
||||
| Find code/text matching a pattern in `/var/log`, `/etc`, archives, system dirs | `terminal_rg` |
|
||||
| Find files matching name/glob/predicate | `terminal_find` |
|
||||
| List a directory | `terminal_exec("ls -la /path")` |
|
||||
| Tree view | `terminal_exec("tree -L 2 /path")` |
|
||||
| Single-path stat | `terminal_exec("stat /path")` |
|
||||
| Disk usage | `terminal_exec("du -sh /path")` or `terminal_exec("du -h --max-depth=2 /")` |
|
||||
| Count matches across files | `terminal_rg(pattern, extra_args=["-c"])` |
|
||||
|
||||
## `terminal_rg` — content search
|
||||
|
||||
ripgrep is fast, gitignore-aware, and has a deep flag surface. The structured wrapper exposes the most useful flags directly; `extra_args` covers the rest.
|
||||
|
||||
### Common patterns
|
||||
|
||||
```
|
||||
# All Python files containing "TODO"
|
||||
terminal_rg(pattern="TODO", path=".", type_filter="py")
|
||||
|
||||
# Case-insensitive, with context
|
||||
terminal_rg(pattern="error", path="/var/log", ignore_case=True, context=2)
|
||||
|
||||
# Search hidden files (rg ignores them by default)
|
||||
terminal_rg(pattern="api_key", path="~", hidden=True)
|
||||
|
||||
# Don't respect .gitignore (find files git would ignore)
|
||||
terminal_rg(pattern="generated", path=".", no_ignore=True)
|
||||
|
||||
# Multi-line pattern (e.g., function definitions spanning lines)
|
||||
terminal_rg(pattern=r"def\s+\w+\(.*\n.*\n", path="src", extra_args=["--multiline"])
|
||||
|
||||
# Specific filename glob
|
||||
terminal_rg(pattern="version", path=".", glob="*.toml")
|
||||
```
|
||||
|
||||
### rg flag idioms
|
||||
|
||||
| Flag | Effect |
|
||||
|---|---|
|
||||
| `-tpy` (`type_filter="py"`) | Only Python files |
|
||||
| `-uu` | Don't respect any ignores (incl. `.git/`) |
|
||||
| `--multiline` (`extra_args`) | Allow regex spanning lines |
|
||||
| `--max-count` (`max_count`) | Stop after N matches per file |
|
||||
| `--max-depth` (`max_depth`) | Limit recursion |
|
||||
| `-w` (`extra_args`) | Whole word match |
|
||||
| `-F` (`extra_args`) | Fixed string (no regex) |
|
||||
|
||||
See `references/ripgrep_cheatsheet.md` for the long form.
|
||||
|
||||
## `terminal_find` — predicate search
|
||||
|
||||
`find` excels at "files matching N criteria". The wrapper surfaces the most common predicates; combine via the structured arguments.
|
||||
|
||||
```
|
||||
# All .log files modified in the last 7 days, larger than 1MB
|
||||
terminal_find(path="/var/log", iname="*.log", mtime_days=7, size_kb_min=1024)
|
||||
|
||||
# All directories named ".git" (find Git repos under a tree)
|
||||
terminal_find(path="~/projects", name=".git", type_filter="d")
|
||||
|
||||
# Only the top three levels
|
||||
terminal_find(path="/etc", max_depth=3, type_filter="f")
|
||||
|
||||
# Symlinks
|
||||
terminal_find(path=".", type_filter="l")
|
||||
```
|
||||
|
||||
See `references/find_predicates.md` for combinations not directly exposed.
|
||||
|
||||
## Output truncation
|
||||
|
||||
Both tools return `truncated: true` when their output exceeded the inline cap. For `terminal_rg`, this means matches were dropped (refine the pattern or narrow the path); for `terminal_find`, results past `max_results` (default 1000) are dropped. Tighten predicates rather than raising the cap.
|
||||
|
||||
## Anti-patterns
|
||||
|
||||
- **Don't `terminal_rg` your project tree** — `files-tools.search_files` is project-aware and ranks results.
|
||||
- **Don't reach for `terminal_find` to list one directory** — `terminal_exec("ls -la /path")` is shorter.
|
||||
- **Don't use `terminal_exec("grep ...")`** when `terminal_rg` exists — rg is faster, gitignore-aware, and returns structured matches.
|
||||
- **Don't use `terminal_exec("find ...")`** to invent your own predicate combinations — use `terminal_find` and report missing capabilities.
|
||||
+78
@@ -0,0 +1,78 @@
|
||||
# find predicate reference
|
||||
|
||||
The `terminal_find` wrapper exposes name/iname, type, mtime_days, size bounds, max_depth, max_results. For combinations beyond that, drop to `terminal_exec("find ...")`.
|
||||
|
||||
## Time predicates
|
||||
|
||||
| Need | find predicate |
|
||||
|---|---|
|
||||
| Modified within N days | `-mtime -N` (wrapper: `mtime_days=N`) |
|
||||
| Modified more than N days ago | `-mtime +N` |
|
||||
| Modified exactly N days ago | `-mtime N` |
|
||||
| Accessed within N days | `-atime -N` |
|
||||
| Inode changed within N days | `-ctime -N` |
|
||||
| Modified in last N minutes | `-mmin -N` |
|
||||
| Newer than reference file | `-newer ref` |
|
||||
|
||||
## Size predicates
|
||||
|
||||
| Need | find predicate |
|
||||
|---|---|
|
||||
| Bigger than N kilobytes | `-size +Nk` (wrapper: `size_kb_min`) |
|
||||
| Smaller than N kilobytes | `-size -Nk` (wrapper: `size_kb_max`) |
|
||||
| Exactly N kilobytes | `-size Nk` |
|
||||
| Bigger than N megabytes | `-size +NM` |
|
||||
| Empty files | `-empty` |
|
||||
|
||||
## Type predicates
|
||||
|
||||
| Need | find predicate |
|
||||
|---|---|
|
||||
| Regular file | `-type f` (wrapper: `type_filter="f"`) |
|
||||
| Directory | `-type d` (wrapper: `type_filter="d"`) |
|
||||
| Symlink | `-type l` (wrapper: `type_filter="l"`) |
|
||||
| Block device | `-type b` |
|
||||
| Character device | `-type c` |
|
||||
| FIFO | `-type p` |
|
||||
| Socket | `-type s` |
|
||||
|
||||
## Permission predicates
|
||||
|
||||
| Need | find predicate |
|
||||
|---|---|
|
||||
| Owned by user | `-user alice` |
|
||||
| Owned by group | `-group dev` |
|
||||
| Permission bits exact | `-perm 644` |
|
||||
| Has any of these bits | `-perm /u+x` |
|
||||
| Has all of these bits | `-perm -u+x` |
|
||||
| Readable by current user | `-readable` |
|
||||
| Writable | `-writable` |
|
||||
| Executable | `-executable` |
|
||||
|
||||
## Composing
|
||||
|
||||
`find` evaluates predicates left-to-right with implicit AND. For OR, group alternatives with `\( ... -o ... \)`:
|
||||
|
||||
```
|
||||
# .log OR .txt (drop to terminal_exec for OR)
|
||||
terminal_exec(r"find /path \( -name '*.log' -o -name '*.txt' \) -type f", shell=True)
|
||||
|
||||
# NOT in a directory called node_modules
|
||||
terminal_exec("find . -path '*/node_modules' -prune -o -name '*.js' -print", shell=True)
|
||||
```
|
||||
|
||||
## Actions
|
||||
|
||||
| Need | predicate |
|
||||
|---|---|
|
||||
| Print path (default) | (implicit `-print`) |
|
||||
| Print null-separated | `-print0` (for piping to xargs -0) |
|
||||
| Delete | `-delete` (DANGEROUS — use terminal_exec with explicit confirmation) |
|
||||
| Run command per match | `-exec cmd {} \;` (drop to terminal_exec) |
|
||||
| Run command, batched | `-exec cmd {} +` |
|
||||
|
||||
## When NOT to use find
|
||||
|
||||
- **One directory listing**: `terminal_exec("ls -la /path")`
|
||||
- **Recursive grep**: `terminal_rg`
|
||||
- **Count files**: `terminal_exec("find /path -type f | wc -l")`
|
||||
+70
@@ -0,0 +1,70 @@
|
||||
# ripgrep cheatsheet
|
||||
|
||||
For when the structured `terminal_rg` flags don't cover the case. Pass via `extra_args=[...]`.
|
||||
|
||||
## Filtering
|
||||
|
||||
| Need | Flag |
|
||||
|---|---|
|
||||
| Whole word | `-w` |
|
||||
| Fixed string (no regex) | `-F` |
|
||||
| Match files only (paths, not lines) | `-l` |
|
||||
| Count matches per file | `-c` |
|
||||
| Print only filenames with no matches | `--files-without-match` |
|
||||
| Exclude binary files | (default) |
|
||||
| Include binaries | `--binary` |
|
||||
| Search archives transparently | (rg doesn't — extract first) |
|
||||
|
||||
## Output shape
|
||||
|
||||
| Need | Flag |
|
||||
|---|---|
|
||||
| Show only matched part | `-o` |
|
||||
| Show byte offset of match | `-b` |
|
||||
| No filename prefix | `-I` (`--no-filename`) |
|
||||
| Color always (for piping into a colorizer) | `--color=always` |
|
||||
| JSON output | (the wrapper already uses `--json` internally) |
|
||||
|
||||
## Boundaries
|
||||
|
||||
| Need | Flag |
|
||||
|---|---|
|
||||
| Line-by-line (default) | (default) |
|
||||
| Multi-line regex | `--multiline` (or `-U`) |
|
||||
| Multi-line dotall (`.` matches `\n`) | `--multiline-dotall` |
|
||||
| Crlf line endings | `--crlf` |
|
||||
|
||||
## Path control
|
||||
|
||||
| Need | Flag |
|
||||
|---|---|
|
||||
| Follow symlinks | `-L` |
|
||||
| Don't follow | (default) |
|
||||
| Search hidden | `-.` (also expressed as `hidden=True`) |
|
||||
| Don't respect any ignores | `-uuu` |
|
||||
| Glob include | `-g 'pattern'` (also `glob="..."`) |
|
||||
| Glob exclude | `-g '!pattern'` |
|
||||
|
||||
## Performance
|
||||
|
||||
| Need | Flag |
|
||||
|---|---|
|
||||
| One thread | `-j 1` |
|
||||
| Force memory-mapped reads | `--mmap` (rg chooses automatically; default usually fine) |
|
||||
| Per-file match cap | `-m N` (also `max_count=N`) |
|
||||
|
||||
## Common composed queries
|
||||
|
||||
```
|
||||
# Find bare module-level imports in Python (starting point for an unused-import audit)
|
||||
terminal_rg(pattern=r"^import\s+\w+$", path="src", type_filter="py")
|
||||
|
||||
# All TODO/FIXME/XXX with file:line
|
||||
terminal_rg(pattern=r"\b(TODO|FIXME|XXX)\b", path=".", extra_args=["-n"])
|
||||
|
||||
# Functions defined at module top-level
|
||||
terminal_rg(pattern=r"^def\s+\w+", path=".", type_filter="py")
|
||||
|
||||
# Lines that DON'T match a pattern (filtered through awk)
|
||||
# rg can't invert at line level; use terminal_exec with grep -v
|
||||
```
|
||||
@@ -0,0 +1,110 @@
|
||||
---
|
||||
name: hive.terminal-tools-job-control
|
||||
description: Use when launching anything that runs longer than a minute, anything that streams logs, anything you want to keep running while doing other work — or when terminal_exec auto-backgrounded on you and returned a job_id. Teaches the start→poll→wait pattern with terminal_job_logs offset bookkeeping, the `wait_until_exit=True` blocking-poll idiom, the truncated_bytes_dropped resumption signal, the merge_stderr decision, the SIGINT→SIGTERM→SIGKILL escalation ladder via terminal_job_manage, and the hard rule that jobs die when the terminal-tools server restarts. Read before calling terminal_job_start, or right after terminal_exec auto-backgrounded.
|
||||
metadata:
|
||||
author: hive
|
||||
type: preset-skill
|
||||
version: "1.0"
|
||||
---
|
||||
|
||||
# Background job control
|
||||
|
||||
Background jobs are how you do things that take time without blocking your conversation. Three tools cover the surface: `terminal_job_start`, `terminal_job_logs`, `terminal_job_manage`.
|
||||
|
||||
## When to use a job
|
||||
|
||||
- Builds, deploys, long tests
|
||||
- Processes you want to monitor (streaming a log file, a dev server)
|
||||
- Anything that auto-backgrounded from `terminal_exec` (you have a `job_id`; pivot to this skill's idioms)
|
||||
|
||||
For one-shot work expected to finish quickly, `terminal_exec` is simpler. The auto-promotion mechanic in `terminal_exec` is your safety net — start with `terminal_exec`, take over with this skill if needed.
|
||||
|
||||
## Lifecycle
|
||||
|
||||
```
|
||||
terminal_job_start(command, ...)
|
||||
→ { job_id, pid, started_at }
|
||||
|
||||
terminal_job_logs(job_id, since_offset=0, max_bytes=64000)
|
||||
→ { data, offset, next_offset, status: "running"|"exited", exit_code, ... }
|
||||
|
||||
# Repeat with since_offset = previous next_offset until status == "exited"
|
||||
# Or block once with wait_until_exit=True:
|
||||
terminal_job_logs(job_id, since_offset=N, wait_until_exit=True, wait_timeout_sec=60)
|
||||
→ blocks server-side until exit or timeout
|
||||
```
|
||||
|
||||
After exit, the job is retained for inspection (`terminal_job_manage(action="list")`) until evicted by FIFO (50 most recent exits kept).
|
||||
|
||||
## Offset bookkeeping — the only rule that matters
|
||||
|
||||
The job's output lives in a 4 MB ring buffer per stream. Each call to `terminal_job_logs` returns:
|
||||
|
||||
- `data` — bytes between `since_offset` and `next_offset`
|
||||
- `next_offset` — pass this as `since_offset` on your next call
|
||||
- `truncated_bytes_dropped` — non-zero when your `since_offset` was older than the ring's floor (you fell behind)
|
||||
|
||||
**Always carry `next_offset` forward.** Don't replay from 0 — that's an offset reset, you'll see the same data twice and miss the part that fell off.
|
||||
|
||||
When `truncated_bytes_dropped > 0`, the buffer evicted N bytes between your last call and now. Treat it as a signal that the job is producing output faster than you're consuming. Either poll more often or accept the gap and read from `next_offset` going forward.
|
||||
|
||||
## merge_stderr — interleaved or separate
|
||||
|
||||
```
|
||||
merge_stderr=False → two streams, request "stdout" or "stderr" by name
|
||||
merge_stderr=True → one stream ("merged"), order preserved
|
||||
```
|
||||
|
||||
Pick `merge_stderr=True` when:
|
||||
- The job's logs are designed to be read together (most servers, build tools)
|
||||
- You don't need to distinguish "this was stderr"
|
||||
|
||||
Pick `merge_stderr=False` when:
|
||||
- stderr is genuinely error-only and stdout is data
|
||||
- You'll process them differently
|
||||
|
||||
## Signal escalation
|
||||
|
||||
```
|
||||
terminal_job_manage(action="signal_int", job_id=...) # graceful (Ctrl-C-equivalent)
|
||||
terminal_job_manage(action="signal_term", job_id=...) # polite kill (SIGTERM)
|
||||
terminal_job_manage(action="signal_kill", job_id=...) # forced kill (SIGKILL, uncatchable)
|
||||
```
|
||||
|
||||
The idiom: `signal_int` → wait 2-5s → `signal_term` → wait 2-5s → `signal_kill`. Most well-behaved processes handle SIGINT (graceful) and SIGTERM (cleanup, then exit). SIGKILL bypasses cleanup — use only when the process is truly unresponsive.
|
||||
|
||||
After signaling, check exit with `terminal_job_logs(job_id, wait_until_exit=True, wait_timeout_sec=2)`.
|
||||
|
||||
## Stdin
|
||||
|
||||
```
|
||||
terminal_job_manage(action="stdin", job_id=..., data="some input\n")
|
||||
terminal_job_manage(action="close_stdin", job_id=...)
|
||||
```
|
||||
|
||||
For tools that read stdin to EOF, `close_stdin` after writing flushes them. For interactive tools that read line-by-line, just write each line.
|
||||
|
||||
## Take-over: when terminal_exec auto-backgrounds
|
||||
|
||||
When `terminal_exec` returned `auto_backgrounded: true, job_id: <X>`, the process is **already** in the JobManager with its output flowing into the ring buffer. Your transition is seamless:
|
||||
|
||||
```
|
||||
# Already saw the start of output in terminal_exec's stdout/stderr.
|
||||
# Pick up reading where the env left off — use the byte count of the
|
||||
# initial stdout as your since_offset, OR just request tail output:
|
||||
terminal_job_logs(job_id="job_xxx", tail=True, max_bytes=64000)
|
||||
```
|
||||
|
||||
Or block until exit and grab everything:
|
||||
|
||||
```
|
||||
terminal_job_logs(job_id="job_xxx", since_offset=0, wait_until_exit=True, wait_timeout_sec=120)
|
||||
```
|
||||
|
||||
## Hard rules
|
||||
|
||||
- **Jobs die when the server restarts.** The desktop runtime restarts terminal-tools when Hive restarts. There's no re-attach. If you need durability, use `nohup` + `terminal_exec` to detach into the system's process tree and track the PID yourself.
|
||||
- **Server-wide hard cap on concurrent jobs** (`TERMINAL_TOOLS_MAX_JOBS`, default 32). Past the cap, `terminal_job_start` returns an error. Wait for jobs to exit or kill old ones.
|
||||
- **No cross-restart output.** Output handles and ring buffers are in-memory only.
|
||||
|
||||
See `references/signals.md` for the full signal catalog.
|
||||
@@ -0,0 +1,41 @@
|
||||
# Signal reference
|
||||
|
||||
terminal_job_manage exposes six signals via the action name.
|
||||
|
||||
| Action | Signal | Number | Purpose | Catchable? |
|
||||
|---|---|---|---|---|
|
||||
| `signal_int` | SIGINT | 2 | Interrupt — Ctrl-C equivalent. Most CLIs treat as "stop gracefully". | Yes |
|
||||
| `signal_term` | SIGTERM | 15 | Polite termination request. Default for `kill`. | Yes |
|
||||
| `signal_kill` | SIGKILL | 9 | Forced kill. Process can't catch, clean up, or finalize. Use sparingly. | **No** |
|
||||
| `signal_hup` | SIGHUP | 1 | Hangup. Many daemons reload config on this. | Yes |
|
||||
| `signal_usr1` | SIGUSR1 | 10 | User-defined #1. Common: dump state, rotate logs (nginx, etc). | Yes |
|
||||
| `signal_usr2` | SIGUSR2 | 12 | User-defined #2. Common: graceful binary upgrade (unicorn, etc). | Yes |
|
||||
|
||||
## Escalation idiom
|
||||
|
||||
```
|
||||
1. signal_int (Ctrl-C — graceful)
|
||||
2. wait 2-5s, check status with terminal_job_logs(wait_until_exit=True, wait_timeout_sec=3)
|
||||
3. if still running: signal_term (cleanup-then-exit)
|
||||
4. wait 2-5s
|
||||
5. if still running: signal_kill (forced)
|
||||
```
|
||||
|
||||
The waits matter: SIGTERM handlers do real work (flush logs, close DBs, release locks) and need time. Skipping straight to SIGKILL leaks resources.
|
||||
|
||||
## When to use SIGUSR1 / SIGUSR2
|
||||
|
||||
These are application-defined. Read the target's docs first. Common:
|
||||
- **nginx**: SIGUSR1 → reopen log files (for log rotation)
|
||||
- **unicorn / puma**: SIGUSR2 → fork a new master with the latest binary (graceful restart)
|
||||
- **rsync**: SIGUSR1 → print stats so far
|
||||
|
||||
## Reading exit codes after a signal
|
||||
|
||||
When a job exits via signal, `terminal_job_logs` returns `exit_code: -N` (subprocess convention) where `abs(N)` is the signal number. The shell convention `128 + N` doesn't apply to the JobManager — that's for shell-spawned children.
|
||||
|
||||
| exit_code | Means |
|
||||
|---|---|
|
||||
| -2 | Killed by SIGINT |
|
||||
| -9 | Killed by SIGKILL |
|
||||
| -15 | Killed by SIGTERM |
|
||||
@@ -0,0 +1,127 @@
|
||||
---
|
||||
name: hive.terminal-tools-pty-sessions
|
||||
description: Use when you need state across calls — building env vars, navigating with cd, driving REPLs (python -i, mysql, psql, node), or responding to interactive prompts (sudo password, ssh host-key confirmation, mysql connection). Teaches the prompt-sentinel exec pattern (default mode), raw I/O for REPLs (raw_send=True then read_only=True), the one-in-flight-per-session rule, and the close-or-leak-against-the-cap discipline. Bash on macOS — never zsh; explicit shell=/bin/zsh is rejected. Read before calling terminal_pty_open.
|
||||
metadata:
|
||||
author: hive
|
||||
type: preset-skill
|
||||
version: "1.0"
|
||||
---
|
||||
|
||||
# Persistent PTY sessions
|
||||
|
||||
PTY sessions are how you talk to interactive programs — programs that detect a terminal (`isatty()`) and behave differently when they don't see one. Use a session when:
|
||||
|
||||
- You need state to persist across calls (`cd`, env vars, sourced scripts)
|
||||
- You're driving a REPL (`python -i`, `mysql`, `psql`, `node`, `irb`)
|
||||
- A program demands an interactive prompt (`sudo`, `ssh`, `npm login`, `gh auth login`)
|
||||
|
||||
For everything else, `terminal_exec` is simpler. Sessions cost more (per-session bash process, ring buffer, idle-reaping bookkeeping) and have a hard cap (`TERMINAL_TOOLS_MAX_PTY`, default 8).
|
||||
|
||||
## Why PTY (and not subprocess pipes)
|
||||
|
||||
Subprocess pipes break on every interactive program. The moment a program calls `isatty()` and sees False, it disables prompts, color, line-editing, password masking, progress bars — sometimes refuses to start. PTY makes us look like a real terminal so these programs work the same as in your shell.
|
||||
|
||||
The cost: PTY output includes terminal escape codes (cursor moves, color codes). The session captures them as-is; if you need clean text, strip ANSI escapes in your processing layer.
|
||||
|
||||
## Bash on macOS — by deliberate policy
|
||||
|
||||
`terminal_pty_open` always invokes `/bin/bash`, regardless of the user's `$SHELL`. macOS users: yes, even when zsh is your interactive default. This is the **terminal-tools-foundations** policy applied to PTYs.
|
||||
|
||||
Reasons:
|
||||
- zsh has command/builtin classes (`zmodload`, `=cmd` expansion, `zpty`, `ztcp`) that bypass bash-shaped security checks
|
||||
- One shell behavior across platforms eliminates "works on Linux, breaks on macOS" surprises
|
||||
- Bash is universal: any shell you've used will accept the bash subset
|
||||
|
||||
The bash invocation uses `--norc --noprofile` so user dotfiles don't leak in. PS1 is set to a unique sentinel for prompt detection. PS2 is empty. PROMPT_COMMAND is empty.
|
||||
|
||||
## Three modes of `terminal_pty_run`
|
||||
|
||||
### 1. Default: send command, wait for prompt sentinel
|
||||
|
||||
```
|
||||
terminal_pty_run(session_id, command="ls -la")
|
||||
→ { output, prompt_after: True, ... }
|
||||
```
|
||||
|
||||
The session writes `ls -la\n`, waits for the sentinel that its custom PS1 emits, returns the slice between submission and prompt. **One in-flight call per session** — a concurrent call returns a `"session busy"` error.
|
||||
|
||||
### 2. raw_send: send raw input, no waiting
|
||||
|
||||
```
|
||||
terminal_pty_run(session_id, command="print('hi')\n", raw_send=True)
|
||||
→ { bytes_sent: 12 }
|
||||
```
|
||||
|
||||
For REPLs, vim keystrokes, password prompts. The session writes the bytes and returns immediately — it doesn't wait for a prompt (REPLs don't print bash's prompt; they print their own).
|
||||
|
||||
After a `raw_send`, you typically follow with:
|
||||
|
||||
### 3. read_only: drain currently-buffered output
|
||||
|
||||
```
|
||||
terminal_pty_run(session_id, read_only=True, timeout_sec=2)
|
||||
→ { output: "hi\n", more: False, ... }
|
||||
```
|
||||
|
||||
Reads whatever the session has accumulated since the last drain, with a brief settle window. Use after raw_send to capture the REPL's response.
|
||||
|
||||
## Custom prompt detection (`expect`)
|
||||
|
||||
When the command launches a program with its own prompt (Python REPL's `>>> `, mysql's `mysql> `, sudo's password prompt), the bash sentinel won't appear until the program exits. Override:
|
||||
|
||||
```
|
||||
terminal_pty_run(session_id, command="python3", expect=r">>>\s*$", timeout_sec=10)
|
||||
→ output up to and including ">>>", then control returns
|
||||
```
|
||||
|
||||
For sudo:
|
||||
|
||||
```
|
||||
terminal_pty_run(session_id, command="sudo -k && sudo whoami", expect=r"[Pp]assword:")
|
||||
terminal_pty_run(session_id, command="<password>\n", raw_send=True)
|
||||
terminal_pty_run(session_id, read_only=True, timeout_sec=5)
|
||||
```
|
||||
|
||||
(Treat passwords carefully — they end up in the ring buffer.)
|
||||
|
||||
## Always close
|
||||
|
||||
```
|
||||
terminal_pty_close(session_id)
|
||||
```
|
||||
|
||||
Leaked sessions count against `TERMINAL_TOOLS_MAX_PTY` (default 8). Idle reaping happens lazily on every `_open` call (sessions inactive longer than `idle_timeout_sec`, default 1800s, are dropped) — but don't rely on it. Close when you're done.
|
||||
|
||||
For unresponsive sessions, `force=True` skips the graceful "exit" attempt and goes straight to SIGTERM/SIGKILL.
|
||||
|
||||
## Common patterns
|
||||
|
||||
### Stateful navigation
|
||||
|
||||
```
|
||||
sid = terminal_pty_open(cwd="/")
|
||||
terminal_pty_run(sid, command="cd /var/log")
|
||||
terminal_pty_run(sid, command="ls -la *.log | head")
|
||||
terminal_pty_close(sid)
|
||||
```
|
||||
|
||||
### Python REPL
|
||||
|
||||
```
|
||||
sid = terminal_pty_open()
|
||||
terminal_pty_run(sid, command="python3", expect=r">>>\s*$")
|
||||
terminal_pty_run(sid, command="x = 42", raw_send=True)
|
||||
terminal_pty_run(sid, command="print(x*x)\n", raw_send=True)
|
||||
result = terminal_pty_run(sid, read_only=True) # → "1764\n>>> "
|
||||
terminal_pty_run(sid, command="exit()", raw_send=True)
|
||||
terminal_pty_close(sid)
|
||||
```
|
||||
|
||||
### ssh with host-key prompt
|
||||
|
||||
```
|
||||
sid = terminal_pty_open()
|
||||
terminal_pty_run(sid, command="ssh user@new-host", expect=r"\(yes/no.*\)\?")
|
||||
terminal_pty_run(sid, command="yes\n", raw_send=True)
|
||||
terminal_pty_run(sid, read_only=True, timeout_sec=10) # password prompt or login
|
||||
```
|
||||
@@ -0,0 +1,92 @@
|
||||
---
|
||||
name: hive.terminal-tools-troubleshooting
|
||||
description: Read when a terminal-tools call returned something surprising — empty stdout despite no error, exit_code is null, output_handle came back expired, "too many jobs" / "session busy" / "too many PTYs", warning was set unexpectedly, semantic_status disagrees with exit_code. Diagnostic recipes only — load on demand. Don't preload; the foundational skill covers the happy path.
|
||||
metadata:
|
||||
author: hive
|
||||
type: preset-skill
|
||||
version: "1.0"
|
||||
---
|
||||
|
||||
# Troubleshooting terminal-tools
|
||||
|
||||
Recipes for surprising results. Match the symptom to the section.
|
||||
|
||||
## Empty `stdout` despite the command "should have" produced output
|
||||
|
||||
Possible causes:
|
||||
1. Output went to **stderr** instead. Check `stderr` in the envelope (or use `merge_stderr=True` for jobs).
|
||||
2. Output was **fully truncated** because `max_output_kb` is too small. Check `stdout_truncated_bytes > 0`. Bump `max_output_kb` or paginate via `output_handle`.
|
||||
3. Command produced no output (correct, just unexpected — `silent` flags, no matches).
|
||||
4. Pipeline issue: the last stage of a pipe ran but stdout went elsewhere (`> /dev/null`, redirected via `2>&1`).
|
||||
5. Process is buffering its output and didn't flush before exit. Add `stdbuf -oL` (line-buffered) or `unbuffer` to the command.
|
||||
|
||||
## `exit_code: null`
|
||||
|
||||
| Cause | Other field |
|
||||
|---|---|
|
||||
| Auto-backgrounded | `auto_backgrounded: true, job_id: <X>` |
|
||||
| Hard timeout, process killed | `timed_out: true` |
|
||||
| Pre-spawn failure (command not found) | `error: ...` set, `pid: null` |
|
||||
| Still running (in `terminal_job_logs`) | `status: "running"` |
|
||||
|
||||
## `output_handle` returned `expired: true`
|
||||
|
||||
5-minute TTL. Either (a) you waited too long, or (b) the store evicted it under memory pressure (64 MB total cap, LRU eviction). Re-run the command.
|
||||
|
||||
To reduce risk: paginate the handle as soon as you receive it, or use `terminal_job_*` for huge outputs (4 MB ring buffer with offsets — no expiry).
|
||||
|
||||
## "too many jobs" / `JobLimitExceeded`
|
||||
|
||||
`TERMINAL_TOOLS_MAX_JOBS` (default 32) hit. Either:
|
||||
- Wait for jobs to exit (poll with `terminal_job_logs(wait_until_exit=True)`)
|
||||
- Kill old jobs: `terminal_job_manage(action="list")` to see what's running, then `signal_term` the abandoned ones
|
||||
- Raise the cap via env (rare)
|
||||
|
||||
## "session busy"
|
||||
|
||||
A `terminal_pty_run` was issued while another `_run` is in flight on the same session. PTY sessions are single-threaded conversations. Wait for the prior call to return, or open a second session.
|
||||
|
||||
## "PTY cap reached"
|
||||
|
||||
`TERMINAL_TOOLS_MAX_PTY` (default 8) hit. Close idle sessions (`terminal_pty_close`). Idle reaping runs lazily during `_open` — but `_open` itself fails once the cap is hit, so you can't rely on it here. Close sessions manually.
|
||||
|
||||
## `warning` is set, the command worked
|
||||
|
||||
Informational only. The pattern matched (e.g. `rm -rf` literally appears, or `git push --force` was used). The command ran. The warning is your "did I mean to do that?" prompt — verify the side effect was intended before continuing.
|
||||
|
||||
## `semantic_status: "ok"` but `exit_code: 1`
|
||||
|
||||
Working as designed. Some commands use exit 1 for legitimate non-error states:
|
||||
- `grep` / `rg` exit 1 when **no matches** found
|
||||
- `find` exit 1 when **some directories were unreadable** (typical on `/proc`, etc.)
|
||||
- `diff` exit 1 when **files differ**
|
||||
- `test` / `[` exit 1 when **condition is false**
|
||||
|
||||
The `semantic_message` field explains. Trust `semantic_status`, not raw `exit_code`.
|
||||
|
||||
## `semantic_status: "error"` but `exit_code: 0`
|
||||
|
||||
Shouldn't happen. If it does, file a bug.
|
||||
|
||||
## `truncated_bytes_dropped > 0` in `terminal_job_logs`
|
||||
|
||||
Your `since_offset` was older than the ring buffer's floor — bytes evicted before you could read them. Either:
|
||||
- Poll faster (lower latency between calls)
|
||||
- Use `merge_stderr=True` (single 4 MB ring instead of 4 MB × 2)
|
||||
- Accept the gap and move forward from `next_offset`
|
||||
|
||||
## `terminal_pty_open` succeeds but the first `_run` times out
|
||||
|
||||
The session may not have produced its first prompt sentinel within the 2-second startup window. Try:
|
||||
- A `terminal_pty_run(sid, read_only=True, timeout_sec=2)` to drain whatever's accumulated
|
||||
- A noop command (`terminal_pty_run(sid, command="true")`) to force a prompt cycle
|
||||
|
||||
Could also indicate the bash process died at startup — `terminal_pty_run(sid, ...)` would then return `"session has exited"`.
|
||||
|
||||
## `shell="/bin/zsh"` returned an error
|
||||
|
||||
By design. terminal-tools is bash-only on POSIX. Use `shell=True` (default `/bin/bash`) or omit `shell=` to exec directly.
|
||||
|
||||
## A command in `shell=True` is interpreted differently than expected
|
||||
|
||||
Bash, not zsh, semantics. `**/*` doesn't recurse without `shopt -s globstar`; `=cmd` expansion doesn't work; arrays use `arr[idx]` not `${arr[idx]}` differently than zsh. When in doubt, the foundational skill's "bash, not zsh" section is the canonical statement.
|
||||
@@ -0,0 +1,203 @@
|
||||
"""Shared skill authoring primitives.
|
||||
|
||||
Validates and materializes a skill folder. Used by three callers:
|
||||
|
||||
1. Queen's ``create_colony`` tool (``queen_lifecycle_tools.py``) — inline
|
||||
content passed by the queen during colony creation.
|
||||
2. HTTP POST / PUT routes under ``/api/**/skills`` — UI-driven creation.
|
||||
3. Future ``create_learned_skill`` tool — runtime learning.
|
||||
|
||||
Keeping the validators and writer here ensures the three paths share one
|
||||
authority; changes to the name regex or frontmatter layout happen in one
|
||||
place.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Framework skill names include dots (``hive.note-taking``), so the
|
||||
# validator needs to allow them even though the queen's ``create_colony``
|
||||
# tool historically forbade dots. User-created skills without dots still
|
||||
# pass; allowing dots simply prevents us from rejecting existing framework
|
||||
# names when the UI toggles them via ``validate_skill_name``.
|
||||
_SKILL_NAME_RE = re.compile(r"^[a-z0-9.-]+$")
|
||||
_MAX_NAME_LEN = 64
|
||||
_MAX_DESC_LEN = 1024
|
||||
|
||||
|
||||
@dataclass
class SkillFile:
    """Supporting file bundled with a skill (relative path + content)."""

    # Path relative to the skill folder root. Callers are expected to
    # have validated it (see ``validate_files``): never absolute, never
    # containing '..', never SKILL.md itself.
    rel_path: Path
    # Full text content, written verbatim as UTF-8 by ``write_skill``.
    content: str
|
||||
|
||||
|
||||
@dataclass
class SkillDraft:
    """Validated skill content ready to be written to disk.

    Instances are produced by :func:`build_draft` after all inputs have
    passed validation; ``write_skill`` consumes them.
    """

    # Normalized skill name (becomes the frontmatter ``name:`` value).
    name: str
    # Single-line description (becomes the frontmatter ``description:`` value).
    description: str
    # Markdown body that follows the frontmatter block.
    body: str
    # Supporting files installed next to SKILL.md.
    files: list[SkillFile] = field(default_factory=list)

    @property
    def skill_md_text(self) -> str:
        """Assemble the final SKILL.md text (frontmatter + body)."""
        # Normalize trailing whitespace to exactly one final newline.
        trimmed_body = self.body.rstrip() + "\n"
        header = f"---\nname: {self.name}\ndescription: {self.description}\n---"
        return f"{header}\n\n{trimmed_body}"
|
||||
|
||||
|
||||
def validate_skill_name(raw: str) -> tuple[str | None, str | None]:
    """Validate and normalize a skill name.

    Strips surrounding whitespace and checks the result against the
    shared naming rules: lowercase alphanumerics, hyphens, and dots
    (dots are allowed so framework names like ``hive.note-taking`` pass).

    Returns ``(normalized_name, error)``; exactly one side is None.
    """
    name = (raw or "").strip() if isinstance(raw, str) else ""
    if not name:
        return None, "skill_name is required"
    if not _SKILL_NAME_RE.match(name):
        # Bug fix: _SKILL_NAME_RE allows dots, but the old message
        # advertised only [a-z0-9-]; keep the message in sync so users
        # aren't told a valid character class that would still fail.
        return None, f"skill_name '{name}' must match [a-z0-9.-] pattern"
    if name.startswith("-") or name.endswith("-") or "--" in name:
        return None, f"skill_name '{name}' has leading/trailing/consecutive hyphens"
    if len(name) > _MAX_NAME_LEN:
        return None, f"skill_name '{name}' exceeds {_MAX_NAME_LEN} chars"
    return name, None
|
||||
|
||||
|
||||
def validate_description(raw: str) -> tuple[str | None, str | None]:
    """Validate and normalize a skill description.

    Returns ``(normalized_description, error)``; exactly one side is None.
    """
    if not isinstance(raw, str):
        return None, "skill_description is required"
    desc = raw.strip()
    if not desc:
        return None, "skill_description is required"
    if len(desc) > _MAX_DESC_LEN:
        return None, f"skill_description must be 1–{_MAX_DESC_LEN} chars"
    # Frontmatter descriptions are line-oriented — the parser reads one value.
    if "\r" in desc or "\n" in desc:
        return None, "skill_description must be a single line (no newlines)"
    return desc, None
|
||||
|
||||
|
||||
def validate_files(raw: list[dict] | None) -> tuple[list[SkillFile] | None, str | None]:
    """Validate the ``skill_files`` payload into :class:`SkillFile` entries.

    Each entry must be an object with a non-empty relative ``path`` and a
    string ``content``. Returns ``(files, error)``; exactly one side is
    None. ``None``/empty input yields ``([], None)``.
    """
    if not raw:
        return [], None
    if not isinstance(raw, list):
        return None, "skill_files must be an array"
    out: list[SkillFile] = []
    for entry in raw:
        if not isinstance(entry, dict):
            return None, "each skill_files entry must be an object with 'path' and 'content'"
        rel_raw = entry.get("path")
        content = entry.get("content")
        if not isinstance(rel_raw, str) or not rel_raw.strip():
            return None, "skill_files entry missing non-empty 'path'"
        if not isinstance(content, str):
            return None, f"skill_files entry '{rel_raw}' missing string 'content'"
        rel_stripped = rel_raw.strip()
        # Allow './foo' but reject '/foo' — relativizing absolute paths silently
        # has bitten other tools; make the intent loud instead.
        if rel_stripped.startswith("./"):
            rel_stripped = rel_stripped[2:]
        # Bug fix: a path of exactly './' used to strip down to '' and
        # become Path('.'), which passed every check and would later make
        # write_skill target the skill directory itself. Reject it here.
        if not rel_stripped:
            return None, f"skill_files path '{rel_raw}' must be relative and inside the skill folder"
        rel_path = Path(rel_stripped)
        if rel_stripped.startswith("/") or rel_path.is_absolute() or ".." in rel_path.parts:
            return None, f"skill_files path '{rel_raw}' must be relative and inside the skill folder"
        if rel_path.as_posix() == "SKILL.md":
            return None, "skill_files must not contain SKILL.md — pass skill_body instead"
        out.append(SkillFile(rel_path=rel_path, content=content))
    return out, None
|
||||
|
||||
|
||||
def build_draft(
    *,
    skill_name: str,
    skill_description: str,
    skill_body: str,
    skill_files: list[dict] | None = None,
) -> tuple[SkillDraft | None, str | None]:
    """Validate every input and assemble an immutable :class:`SkillDraft`.

    Delegates to the individual validators so all callers share one
    authority. Returns ``(draft, error)``; exactly one side is None.
    """
    name, name_err = validate_skill_name(skill_name)
    if name_err or name is None:
        return None, name_err

    desc, desc_err = validate_description(skill_description)
    if desc_err or desc is None:
        return None, desc_err

    # The body is the operational core of the skill; an empty one is useless.
    if not (isinstance(skill_body, str) and skill_body.strip()):
        return None, (
            "skill_body is required — the operational procedure the colony worker needs to run this job unattended"
        )

    files, files_err = validate_files(skill_files)
    if files_err or files is None:
        return None, files_err

    return SkillDraft(name=name, description=desc, body=skill_body, files=list(files)), None
|
||||
|
||||
|
||||
def write_skill(
    draft: SkillDraft,
    *,
    target_root: Path,
    replace_existing: bool = True,
) -> tuple[Path | None, str | None, bool]:
    """Write ``draft`` to ``target_root/{draft.name}/``.

    ``target_root`` is the parent scope dir (e.g.
    ``~/.hive/agents/queens/{id}/skills`` or ``{colony_dir}/skills``);
    it is created on demand.

    Returns ``(installed_path, error, replaced)``. On success ``error``
    is ``None``; on failure ``installed_path`` is ``None`` and the target
    is left as it was before the call (best-effort).

    With ``replace_existing=False`` an already-existing target dir is
    refused with a non-fatal error (the caller decides whether that is
    a 409 or a warning).
    """
    try:
        target_root.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        return None, f"failed to create skills root: {exc}", False

    target = target_root / draft.name
    replaced = False
    try:
        if target.exists():
            if not replace_existing:
                return None, f"skill '{draft.name}' already exists", False
            # Wipe the previous version outright so stale files from an
            # older install don't linger next to the new ones.
            replaced = True
            shutil.rmtree(target)
        target.mkdir(parents=True, exist_ok=False)
        (target / "SKILL.md").write_text(draft.skill_md_text, encoding="utf-8")
        for bundled in draft.files:
            destination = target / bundled.rel_path
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(bundled.content, encoding="utf-8")
    except OSError as exc:
        return None, f"failed to write skill folder {target}: {exc}", replaced
    return target, None, replaced
|
||||
|
||||
|
||||
def remove_skill(target_root: Path, skill_name: str) -> tuple[bool, str | None]:
    """Recursively delete the skill directory ``target_root/{skill_name}/``.

    Returns ``(removed, error)``. ``(False, None)`` means the directory
    was already absent (idempotent). The name is re-validated on the way
    in so an attacker with UI access can't traverse out of the scope root.
    """
    candidate, err = validate_skill_name(skill_name)
    if err or candidate is None:
        return False, err

    skill_dir = target_root / candidate
    if not skill_dir.exists():
        # Already gone — treat as a no-op, not an error.
        return False, None
    try:
        shutil.rmtree(skill_dir)
    except OSError as exc:
        return False, f"failed to remove skill folder {skill_dir}: {exc}"
    return True, None
|
||||
@@ -34,12 +34,8 @@ Constraints: never read more than one skill up front; only read after selecting.
|
||||
- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.),
|
||||
assume rate limits: prefer fewer larger writes, avoid tight one-item loops,
|
||||
serialize bursts when possible, and respect 429/Retry-After.
|
||||
|
||||
|
||||
The following skills provide specialized instructions for specific tasks.
|
||||
Use `read_file` to load a skill's SKILL.md when the task matches its description.
|
||||
When a skill file references a relative path, resolve it against the
|
||||
skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
|
||||
- When a selected skill references a relative path, resolve it against the
|
||||
skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
|
||||
|
||||
_MANDATORY_HEADER_COMPACT = """## Skills (mandatory)
|
||||
Before replying: scan <available_skills> <name> entries.
|
||||
@@ -50,12 +46,8 @@ Constraints: never read more than one skill up front; only read after selecting.
|
||||
- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.),
|
||||
assume rate limits: prefer fewer larger writes, avoid tight one-item loops,
|
||||
serialize bursts when possible, and respect 429/Retry-After.
|
||||
|
||||
|
||||
The following skills provide specialized instructions for specific tasks.
|
||||
Use `read_file` to load a skill's SKILL.md when the task matches its name.
|
||||
When a skill file references a relative path, resolve it against the
|
||||
skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
|
||||
- When a selected skill references a relative path, resolve it against the
|
||||
skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
|
||||
|
||||
|
||||
class SkillCatalog:
|
||||
|
||||
@@ -7,7 +7,7 @@ locations. Resolves name collisions deterministically.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from framework.skills.parser import ParsedSkill, parse_skill_md
|
||||
@@ -30,16 +30,40 @@ _SKIP_DIRS = frozenset(
|
||||
)
|
||||
|
||||
# Scope priority (higher = takes precedence)
|
||||
# ``preset`` sits between framework and user: bundled alongside the
|
||||
# framework distribution, but off by default — capability packs the user
|
||||
# opts into per queen/colony rather than globally-enabled infra.
|
||||
_SCOPE_PRIORITY = {
|
||||
"framework": 0,
|
||||
"user": 1,
|
||||
"project": 2,
|
||||
"preset": 1,
|
||||
"user": 2,
|
||||
"queen_ui": 3,
|
||||
"colony_ui": 4,
|
||||
"project": 5,
|
||||
}
|
||||
|
||||
# Within the same scope, Hive-specific paths override cross-client paths.
|
||||
# We encode this by scanning cross-client first, then Hive-specific (later wins).
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtraScope:
|
||||
"""Additional scope dir to scan beyond the standard five.
|
||||
|
||||
Used by :class:`framework.skills.manager.SkillsManager` to surface
|
||||
per-queen (``queen_ui``) and per-colony (``colony_ui``) skill
|
||||
directories created through the UI. The ``label`` feeds
|
||||
:attr:`ParsedSkill.source_scope` so downstream consumers (trust
|
||||
gate, UI provenance resolver) can distinguish scope origins.
|
||||
"""
|
||||
|
||||
directory: Path
|
||||
label: str
|
||||
# Kept for forward-compat with the priority table; discovery itself
|
||||
# relies on scan order for last-wins resolution.
|
||||
priority: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class DiscoveryConfig:
|
||||
"""Configuration for skill discovery."""
|
||||
@@ -49,6 +73,10 @@ class DiscoveryConfig:
|
||||
skip_framework_scope: bool = False
|
||||
max_depth: int = 4
|
||||
max_dirs: int = 2000
|
||||
# Additional scope dirs scanned between user and project scopes,
|
||||
# in the order they are provided. Use ``ExtraScope`` to tag each
|
||||
# with its logical label (``queen_ui`` / ``colony_ui``).
|
||||
extra_scopes: list[ExtraScope] = field(default_factory=list)
|
||||
|
||||
|
||||
class SkillDiscovery:
|
||||
@@ -82,13 +110,22 @@ class SkillDiscovery:
|
||||
all_skills: list[ParsedSkill] = []
|
||||
self._scanned_dirs = []
|
||||
|
||||
# Framework scope (lowest precedence)
|
||||
# Framework scope (lowest precedence) — always-on infra skills.
|
||||
if not self._config.skip_framework_scope:
|
||||
framework_dir = Path(__file__).parent / "_default_skills"
|
||||
if framework_dir.is_dir():
|
||||
self._scanned_dirs.append(framework_dir)
|
||||
all_skills.extend(self._scan_scope(framework_dir, "framework"))
|
||||
|
||||
# Preset scope — bundled capability packs that ship with the
|
||||
# framework but default to OFF. User opts in per queen/colony
|
||||
# via the Skills Library. ``skip_framework_scope`` covers both
|
||||
# bundled directories since they live side-by-side on disk.
|
||||
preset_dir = Path(__file__).parent / "_preset_skills"
|
||||
if preset_dir.is_dir():
|
||||
self._scanned_dirs.append(preset_dir)
|
||||
all_skills.extend(self._scan_scope(preset_dir, "preset"))
|
||||
|
||||
# User scope
|
||||
if not self._config.skip_user_scope:
|
||||
home = Path.home()
|
||||
@@ -99,12 +136,23 @@ class SkillDiscovery:
|
||||
self._scanned_dirs.append(user_agents)
|
||||
all_skills.extend(self._scan_scope(user_agents, "user"))
|
||||
|
||||
# Hive-specific (higher precedence within user scope)
|
||||
user_hive = home / ".hive" / "skills"
|
||||
# Hive-specific (higher precedence within user scope). Honors
|
||||
# HIVE_HOME so the desktop's per-user root (set via env) wins
|
||||
# over the shared ``~/.hive`` location.
|
||||
from framework.config import HIVE_HOME
|
||||
|
||||
user_hive = HIVE_HOME / "skills"
|
||||
if user_hive.is_dir():
|
||||
self._scanned_dirs.append(user_hive)
|
||||
all_skills.extend(self._scan_scope(user_hive, "user"))
|
||||
|
||||
# Extra scopes (queen_ui / colony_ui), scanned between user and project
|
||||
# so colony overrides beat queen overrides, and both beat user-scope.
|
||||
for extra in self._config.extra_scopes:
|
||||
if extra.directory.is_dir():
|
||||
self._scanned_dirs.append(extra.directory)
|
||||
all_skills.extend(self._scan_scope(extra.directory, extra.label))
|
||||
|
||||
# Project scope (highest precedence)
|
||||
if self._config.project_root:
|
||||
root = self._config.project_root
|
||||
|
||||
@@ -15,14 +15,18 @@ import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from framework.config import HIVE_HOME
|
||||
from framework.skills.parser import ParsedSkill
|
||||
from framework.skills.skill_errors import SkillError, SkillErrorCode
|
||||
|
||||
# Default install destination for user-scope skills
|
||||
USER_SKILLS_DIR = Path.home() / ".hive" / "skills"
|
||||
# Default install destination for user-scope skills.
|
||||
# Anchored on HIVE_HOME so the desktop shell can override the install
|
||||
# root via $HIVE_HOME without patching every call site.
|
||||
USER_SKILLS_DIR = HIVE_HOME / "skills"
|
||||
|
||||
# Sentinel file for the one-time security notice on first install (NFR-5).
|
||||
INSTALL_NOTICE_SENTINEL = HIVE_HOME / ".install_notice_shown"
|
||||
|
||||
# Sentinel file for the one-time security notice on first install (NFR-5)
|
||||
INSTALL_NOTICE_SENTINEL = Path.home() / ".hive" / ".install_notice_shown"
|
||||
|
||||
_INSTALL_NOTICE = """\
|
||||
─────────────────────────────────────────────────────────────
|
||||
@@ -44,7 +48,7 @@ _INSTALL_NOTICE = """\
|
||||
def maybe_show_install_notice() -> None:
|
||||
"""Print a one-time security notice before the first skill install (NFR-5).
|
||||
|
||||
Touches a sentinel file in ~/.hive/ after showing the notice so it is
|
||||
Touches a sentinel file in $HIVE_HOME after showing the notice so it is
|
||||
only displayed once across all future installs.
|
||||
"""
|
||||
if INSTALL_NOTICE_SENTINEL.exists():
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user