Compare commits

...

12 Commits

Author SHA1 Message Date
Richard Tang 22df99ef51 Merge remote-tracking branch 'origin/main'
Release / Create Release (push) Waiting to run
2026-04-14 19:56:33 -07:00
Richard Tang edc3135797 Merge branch 'feature/new-colony' 2026-04-14 19:56:08 -07:00
Richard Tang 27b15789fb fix: skills prompts 2026-04-14 18:51:14 -07:00
RichardTang-Aden 5ba5933edc Merge pull request #7046 from vincentjiang777/main
docs: new readme
2026-04-14 18:02:49 -07:00
Timothy 50eb4b0e8f Merge branch 'feature/colony-creation' into feature/new-colony 2026-04-14 16:34:30 -07:00
Richard Tang 3e4a4c9924 Merge remote-tracking branch 'origin/feat/text-only-tool-filter' into feature/new-colony 2026-04-14 16:29:19 -07:00
Richard Tang c47987e73c fix: ask user widget fallback 2026-04-14 16:27:12 -07:00
Richard Tang 8f5daf0569 fix: switching model and new chat 2026-04-14 16:04:07 -07:00
bryan af5c72e785 feat: hide image-producing tools and vision-only prompt blocks from text-only models 2026-04-14 12:50:44 -07:00
bryan 5cdc01cb8c fix: preserve tool pill mapping across turn boundary for deferred ask_user completions 2026-04-14 10:56:38 -07:00
Hundao 2f58cce781 fix(tools): web_scrape truncation no longer exceeds max_length (#7044)
The previous code did `text[:max_length] + "..."`, which made the
returned content always 3 chars longer than the requested max_length.
Reserve room for the ellipsis inside the limit so the contract holds.

Fixes #2098
2026-04-14 14:24:42 +08:00
vincentjiang777 9dc214cfd2 Merge branch 'aden-hive:main' into main 2026-04-10 20:35:42 -07:00
22 changed files with 1018 additions and 175 deletions
+9 -2
View File
@@ -87,7 +87,7 @@ from framework.agent_loop.internals.types import (
)
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
from framework.host.event_bus import EventBus
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool, ToolResult, ToolUse
from framework.llm.stream_events import (
FinishEvent,
@@ -632,13 +632,20 @@ class AgentLoop(AgentProtocol):
if isinstance(stream_id, str) and stream_id.startswith("worker:"):
tools.append(build_report_to_parent_tool())
# Hide image-producing tools from text-only models so they never try
# to call them. Avoids wasted turns + "screenshot failed" lessons
# getting saved to memory. See framework.llm.capabilities.
_llm_model = ctx.llm.model if ctx.llm else ""
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
logger.info(
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s",
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
node_id,
len(tools),
[t.name for t in tools],
ctx.supports_direct_user_io,
type(self._judge).__name__ if self._judge else "None",
_hidden_image_tools,
)
# 4. Publish loop started
+26 -2
View File
@@ -1,5 +1,6 @@
"""Node definitions for Queen agent."""
import re
from pathlib import Path
from framework.orchestrator import NodeSpec
@@ -32,6 +33,29 @@ def _build_appendices() -> str:
return parts
# Wraps prompt sections that should only be shown to vision-capable models.
# Content inside `<!-- vision-only -->...<!-- /vision-only -->` is kept for
# vision models and stripped for text-only models. Applied once per session
# in queen_orchestrator.create_queen.
_VISION_ONLY_BLOCK_RE = re.compile(
    r"<!-- vision-only -->(.*?)<!-- /vision-only -->",
    re.DOTALL,
)


def finalize_queen_prompt(text: str, has_vision: bool) -> str:
    """Resolve `<!-- vision-only -->` blocks based on model capability.

    Vision-capable models keep the inner content with the markers
    stripped; text-only models lose the entire block (markers plus
    content) so the queen is never nudged toward tools it cannot
    usefully invoke.
    """
    # Vision models keep the captured group; text-only models drop it all.
    replacement = r"\1" if has_vision else ""
    return _VISION_ONLY_BLOCK_RE.sub(replacement, text)
# Shared appendices — appended to every coding node's system prompt.
_appendices = _build_appendices()
@@ -504,7 +528,7 @@ The queen writes final production-ready system prompts directly.
MCP servers are loaded from the global registry by name. Available servers:
- `hive_tools` web search, email, CRM, calendar, 100+ integrations
- `gcu-tools` browser automation (click, type, navigate, screenshot)
- `gcu-tools` browser automation (click, type, navigate<!-- vision-only -->, screenshot<!-- /vision-only -->)
- `files-tools` file I/O (read, write, edit, search, list)
**Template variables:** Add a `variables:` section at the top of agent.json \
@@ -862,7 +886,7 @@ search_files, run_command, undo_changes
## Browser Automation (gcu-tools MCP)
All browser tools are prefixed with `browser_` (browser_start, browser_navigate, \
browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \
browser_click, browser_fill, browser_snapshot, <!-- vision-only -->browser_screenshot, <!-- /vision-only -->browser_scroll, \
browser_tabs, browser_close, browser_evaluate, etc.).
Follow the browser-automation skill protocol — activate it before using browser tools.
@@ -21,7 +21,9 @@ All tools are prefixed with `browser_`:
- `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type` — interact
- `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
- `browser_snapshot` — compact accessibility-tree read (structured)
<!-- vision-only -->
- `browser_screenshot` — visual capture (annotated PNG)
<!-- /vision-only -->
- `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`)
- `browser_coords` — convert image pixels to CSS pixels (always use `css_x/y`, never `physical_x/y`)
- `browser_scroll`, `browser_wait` — navigation helpers
+24
View File
@@ -12,6 +12,11 @@ Vision support rules are derived from official vendor documentation:
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from framework.llm.provider import Tool
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
@@ -104,3 +109,22 @@ def supports_image_tool_results(model: str) -> bool:
# 5. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
    """Drop image-producing tools for text-only models.

    Returns ``(filtered_tools, hidden_names)``. For vision-capable models
    (or when *model* is empty) the input list is returned unchanged and
    ``hidden_names`` is empty. For text-only models any tool with
    ``produces_image=True`` is removed so the LLM never sees it in its
    schema — this avoids wasted calls and stale "screenshot failed"
    entries in agent memory.
    """
    # An empty model string means "unknown capability"; default to
    # showing every tool rather than silently hiding any.
    if not model or supports_image_tool_results(model):
        return list(tools), []
    hidden = [t.name for t in tools if t.produces_image]
    if not hidden:
        # Nothing image-producing to hide; still return a fresh copy so
        # callers can mutate the result without touching the input list.
        return list(tools), []
    kept = [t for t in tools if not t.produces_image]
    return kept, hidden
+3
View File
@@ -27,6 +27,9 @@ class Tool:
name: str
description: str
parameters: dict[str, Any] = field(default_factory=dict)
# If True, the tool may return ImageContent in its result. Text-only models
# (e.g. glm-5, deepseek-chat) have this hidden from their schema entirely.
produces_image: bool = False
# If True, this tool performs no filesystem/process/network writes and is
# safe to run concurrently with other safe-flagged tools inside the same
# assistant turn. Unsafe tools (writes, shell, browser actions) are always
+12
View File
@@ -7,6 +7,7 @@ import inspect
import json
import logging
import os
import re
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
@@ -18,6 +19,16 @@ logger = logging.getLogger(__name__)
_INPUT_LOG_MAX_LEN = 500
# Tools whose names match this pattern are assumed to return ImageContent.
# Matched against the bare tool name (case-insensitive). Used to mark MCP
# tools with produces_image=True so they can be filtered out for text-only
# models before the schema is ever shown to the LLM (avoids wasted calls
# and "screenshot failed" entries polluting memory).
# NOTE(review): this is a substring heuristic — a tool named e.g.
# `delete_screenshot` or `screenshot_list` would also be flagged as
# image-producing; confirm that is acceptable for the registered tools.
_IMAGE_TOOL_NAME_RE = re.compile(
    r"(screenshot|screen_capture|capture_image|render_image|get_image|snapshot_image)",
    re.IGNORECASE,
)
# Per-execution context overrides. Each asyncio task (and thus each
# concurrent graph execution) gets its own copy, so there are no races
# when multiple ExecutionStreams run in parallel.
@@ -998,6 +1009,7 @@ class ToolRegistry:
"properties": properties,
"required": required,
},
produces_image=bool(_IMAGE_TOOL_NAME_RE.search(mcp_tool.name or "")),
concurrency_safe=mcp_tool.name in self.CONCURRENCY_SAFE_TOOLS,
)
+53 -32
View File
@@ -311,7 +311,9 @@ async def create_queen(
_queen_tools_running,
_queen_tools_staging,
_shared_building_knowledge,
finalize_queen_prompt,
)
from framework.llm.capabilities import supports_image_tool_results
from framework.host.event_bus import AgentEvent, EventType
from framework.loader.mcp_registry import MCPRegistry
from framework.loader.tool_registry import ToolRegistry
@@ -489,6 +491,13 @@ async def create_queen(
"according to your current phase."
)
# Resolve vision-only prompt sections based on the session's LLM.
# session.llm is immutable for the session's lifetime, so this check
# is stable — prompts never need to be recomposed mid-session.
_has_vision = bool(
session.llm and supports_image_tool_results(getattr(session.llm, "model", ""))
)
_planning_body = (
_queen_character_core
+ _queen_role_planning
@@ -500,7 +509,7 @@ async def create_queen(
+ _planning_knowledge
+ worker_identity
)
phase_state.prompt_planning = _planning_body
phase_state.prompt_planning = finalize_queen_prompt(_planning_body, _has_vision)
_building_body = (
_queen_character_core
@@ -515,40 +524,52 @@ async def create_queen(
+ _appendices
+ worker_identity
)
phase_state.prompt_building = _building_body
phase_state.prompt_staging = (
_queen_character_core
+ _queen_role_staging
+ _queen_style
+ _queen_tools_staging
+ _queen_behavior_always
+ _queen_behavior_staging
+ worker_identity
phase_state.prompt_building = finalize_queen_prompt(_building_body, _has_vision)
phase_state.prompt_staging = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_staging
+ _queen_style
+ _queen_tools_staging
+ _queen_behavior_always
+ _queen_behavior_staging
+ worker_identity
),
_has_vision,
)
phase_state.prompt_running = (
_queen_character_core
+ _queen_role_running
+ _queen_style
+ _queen_tools_running
+ _queen_behavior_always
+ _queen_behavior_running
+ worker_identity
phase_state.prompt_running = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_running
+ _queen_style
+ _queen_tools_running
+ _queen_behavior_always
+ _queen_behavior_running
+ worker_identity
),
_has_vision,
)
phase_state.prompt_editing = (
_queen_identity_editing
+ _queen_style
+ _queen_tools_editing
+ _queen_behavior_always
+ _queen_behavior_editing
+ worker_identity
phase_state.prompt_editing = finalize_queen_prompt(
(
_queen_identity_editing
+ _queen_style
+ _queen_tools_editing
+ _queen_behavior_always
+ _queen_behavior_editing
+ worker_identity
),
_has_vision,
)
phase_state.prompt_independent = (
_queen_character_core
+ _queen_role_independent
+ _queen_style
+ _queen_tools_independent
+ _queen_behavior_always
+ _queen_behavior_independent
phase_state.prompt_independent = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_independent
+ _queen_style
+ _queen_tools_independent
+ _queen_behavior_always
+ _queen_behavior_independent
),
_has_vision,
)
# ---- Default skill protocols -------------------------------------
+7 -1
View File
@@ -284,10 +284,16 @@ def _get_subscription_token(sub_id: str) -> str | None:
def _hot_swap_sessions(
request: web.Request, full_model: str, api_key: str | None, api_base: str | None
) -> int:
"""Hot-swap the LLM on all running sessions. Returns count of swapped sessions."""
"""Hot-swap the LLM on all running sessions. Returns count of swapped sessions.
Also refreshes the SessionManager's default model so that subsequent
one-shot LLM consumers (e.g. /messages/classify, new session bootstrap)
pick up the new provider/model instead of the stale startup override.
"""
from framework.server.session_manager import SessionManager
manager: SessionManager = request.app["manager"]
manager._model = full_model
swapped = 0
for session in manager.list_sessions():
llm_provider = getattr(session, "llm", None)
+57 -18
View File
@@ -14,13 +14,37 @@ from framework.skills.skill_errors import SkillErrorCode, log_skill_error
logger = logging.getLogger(__name__)
_BEHAVIORAL_INSTRUCTION = (
"The following skills provide specialized instructions for specific tasks.\n"
"When a task matches a skill's description, read the SKILL.md at the listed\n"
"location to load the full instructions before proceeding.\n"
"When a skill references relative paths, resolve them against the skill's\n"
"directory (the parent of SKILL.md) and use absolute paths in tool calls."
)
# Upper bound on the raw `<available_skills>` XML body, in characters.
# When the full catalog (with <description> entries) exceeds this, we fall
# back to the compact variant that drops descriptions but keeps every skill
# visible. Preserving awareness of every skill beats truncating entries.
_COMPACT_THRESHOLD_CHARS = 5000
_MANDATORY_HEADER_FULL = """## Skills (mandatory)
Before replying: scan <available_skills> <description> entries.
- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
- If multiple could apply: choose the most specific one, then read/follow it.
- If none clearly apply: do not read any SKILL.md.
Constraints: never read more than one skill up front; only read after selecting.
- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
The following skills provide specialized instructions for specific tasks.
Use `read_file` to load a skill's SKILL.md when the task matches its description.
When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
_MANDATORY_HEADER_COMPACT = """## Skills (mandatory)
Before replying: scan <available_skills> <name> entries.
- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
- If multiple could apply: choose the most specific one, then read/follow it.
- If none clearly apply: do not read any SKILL.md.
Constraints: never read more than one skill up front; only read after selecting.
- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
The following skills provide specialized instructions for specific tasks.
Use `read_file` to load a skill's SKILL.md when the task matches its name.
When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
class SkillCatalog:
@@ -61,27 +85,42 @@ class SkillCatalog:
def to_prompt(self) -> str:
"""Generate the catalog prompt for system prompt injection.
Returns empty string if no community/user skills are discovered
(default skills are handled separately by DefaultSkillManager).
"""
# All skills go through the catalog for progressive disclosure.
all_skills = list(self._skills.values())
Returns empty string when no skills are present. Otherwise returns
a mandatory pre-reply checklist + decision rules + rate-limit note,
followed by the <available_skills> XML body.
When the full XML body exceeds ``_COMPACT_THRESHOLD_CHARS``, the
compact variant is emitted instead: <description> elements are
dropped so every skill stays visible before any gets truncated.
"""
all_skills = sorted(self._skills.values(), key=lambda s: s.name)
if not all_skills:
return ""
full_xml = self._render_xml(all_skills, compact=False)
if len(full_xml) <= _COMPACT_THRESHOLD_CHARS:
return f"{_MANDATORY_HEADER_FULL}\n\n{full_xml}"
compact_xml = self._render_xml(all_skills, compact=True)
return f"{_MANDATORY_HEADER_COMPACT}\n\n{compact_xml}"
@staticmethod
def _render_xml(skills: list[ParsedSkill], *, compact: bool) -> str:
"""Render the `<available_skills>` block.
``compact=True`` drops `<description>` to preserve skill awareness
when the catalog would otherwise blow the char budget.
"""
lines = ["<available_skills>"]
for skill in sorted(all_skills, key=lambda s: s.name):
for skill in skills:
lines.append(" <skill>")
lines.append(f" <name>{escape(skill.name)}</name>")
lines.append(f" <description>{escape(skill.description)}</description>")
if not compact:
lines.append(f" <description>{escape(skill.description)}</description>")
lines.append(f" <location>{escape(skill.location)}</location>")
lines.append(f" <base_dir>{escape(skill.base_dir)}</base_dir>")
lines.append(" </skill>")
lines.append("</available_skills>")
xml_block = "\n".join(lines)
return f"{_BEHAVIORAL_INSTRUCTION}\n\n{xml_block}"
return "\n".join(lines)
def build_pre_activated_prompt(self, skill_names: list[str]) -> str:
"""Build prompt content for pre-activated skills.
+245 -13
View File
@@ -212,6 +212,211 @@ function ToolActivityRow({ content }: { content: string }) {
);
}
// --- Inline ask_user fallback ---------------------------------------------
// Sometimes the model prints the ask_user / ask_user_multiple payload as
// regular assistant text instead of invoking the tool. We detect that
// payload here and render a QuestionWidget / MultiQuestionWidget inline so
// the user still gets the nice button UI. Submissions are sent back as a
// regular user message via onSend (there is no pending backend state to
// fulfill, so we treat it like the user answering in chat).

// Normalized result of detection: either one question with options, or a
// list of (id, prompt, optional options) entries for the multi widget.
type AskUserInlinePayload =
  | { kind: "single"; question: string; options: string[] }
  | {
      kind: "multi";
      questions: { id: string; prompt: string; options?: string[] }[];
    };

// Parse assistant text into an AskUserInlinePayload, or return null when
// the text is not a recognizable ask_user / ask_user_multiple JSON object.
// Deliberately strict: any malformed entry rejects the whole payload so we
// never hijack ordinary prose that merely resembles JSON.
function detectAskUserPayload(content: string): AskUserInlinePayload | null {
  if (!content) return null;
  let text = content.trim();
  if (!text) return null;
  // Strip an optional ```json ... ``` / ``` ... ``` code fence
  const fence = text.match(/^```(?:json|JSON)?\s*([\s\S]*?)\s*```$/);
  if (fence) text = fence[1].trim();
  // Strip surrounding double quotes that fully wrap a JSON object
  if (text.length >= 2 && text.startsWith('"') && text.endsWith('"')) {
    const inner = text.slice(1, -1).trim();
    if (inner.startsWith("{") && inner.endsWith("}")) text = inner;
  }
  // Only a bare JSON object can be an ask_user payload.
  if (!text.startsWith("{") || !text.endsWith("}")) return null;
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return null;
  }
  if (!parsed || typeof parsed !== "object") return null;
  const obj = parsed as Record<string, unknown>;
  // ask_user_multiple: { questions: [{ id, prompt, options? }, ...] }
  if (Array.isArray(obj.questions)) {
    const raw = obj.questions as unknown[];
    // Sanity bounds: an empty list is meaningless, and more than 8
    // questions is almost certainly not a real ask_user_multiple call.
    if (raw.length < 1 || raw.length > 8) return null;
    const questions: { id: string; prompt: string; options?: string[] }[] = [];
    for (let i = 0; i < raw.length; i++) {
      const q = raw[i];
      if (!q || typeof q !== "object") return null;
      const qo = q as Record<string, unknown>;
      // Accept either `prompt` or `question` as the field name.
      const prompt =
        typeof qo.prompt === "string"
          ? qo.prompt
          : typeof qo.question === "string"
            ? qo.question
            : null;
      if (!prompt) return null;
      // Fall back to a positional id when none is supplied.
      const id = typeof qo.id === "string" && qo.id ? qo.id : `q${i}`;
      let options: string[] | undefined;
      if (
        Array.isArray(qo.options) &&
        qo.options.every((o) => typeof o === "string")
      ) {
        options = qo.options as string[];
      }
      questions.push({ id, prompt, options });
    }
    return { kind: "multi", questions };
  }
  // ask_user: { question: string, options: string[] }
  const question = typeof obj.question === "string" ? obj.question : null;
  const options =
    Array.isArray(obj.options) &&
    obj.options.every((o) => typeof o === "string")
      ? (obj.options as string[])
      : null;
  // Require at least two options so short JSON-ish replies aren't hijacked.
  if (!question || !options || options.length < 2) return null;
  return { kind: "single", question, options };
}
// Renders a misformatted ask_user payload as an interactive widget bubble.
// Props mirror MessageBubble's so this can be swapped in wherever a payload
// is detected; answers go back through onSend as a normal user message on
// the message's thread.
function InlineAskUserBubble({
  msg,
  payload,
  activeThread,
  onSend,
  queenPhase,
  showQueenPhaseBadge = true,
}: {
  msg: ChatMessage;
  payload: AskUserInlinePayload;
  activeThread: string;
  onSend: (
    message: string,
    thread: string,
    images?: ImageContent[],
  ) => void;
  queenPhase?: "planning" | "building" | "staging" | "running" | "independent";
  showQueenPhaseBadge?: boolean;
}) {
  const [state, setState] = useState<"pending" | "submitted" | "dismissed">(
    "pending",
  );
  // Once the user submits an answer via the inline widget, hide the whole
  // bubble — their reply appears right after as a normal user message.
  if (state === "submitted") return null;
  // If the user dismissed without answering, fall back to the regular
  // MarkdownContent rendering so they can still see what the model said.
  if (state === "dismissed") {
    return (
      <MessageBubble
        msg={msg}
        queenPhase={queenPhase}
        showQueenPhaseBadge={showQueenPhaseBadge}
      />
    );
  }
  const isQueen = msg.role === "queen";
  const color = getColor(msg.agent, msg.role);
  // Prefer the message's own thread; fall back to the active one.
  const thread = msg.thread || activeThread;
  const handleSingle = (answer: string) => {
    setState("submitted");
    onSend(answer, thread);
  };
  const handleMulti = (answers: Record<string, string>) => {
    setState("submitted");
    if (payload.kind !== "multi") return;
    // Format answers as a readable, numbered list for the outgoing message.
    const lines = payload.questions.map((q, i) => {
      const a = answers[q.id] ?? "";
      return `${i + 1}. ${q.prompt}\n ${a}`;
    });
    onSend(lines.join("\n"), thread);
  };
  return (
    <div className="flex gap-3">
      {/* Avatar: crown for the queen, CPU chip for workers. */}
      <div
        className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
        style={{
          backgroundColor: `${color}18`,
          border: `1.5px solid ${color}35`,
          boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
        }}
      >
        {isQueen ? (
          <Crown className="w-4 h-4" style={{ color }} />
        ) : (
          <Cpu className="w-3.5 h-3.5" style={{ color }} />
        )}
      </div>
      <div
        className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}
      >
        <div className="flex items-center gap-2 mb-1">
          <span
            className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
            style={{ color }}
          >
            {msg.agent}
          </span>
          {(!isQueen || showQueenPhaseBadge) && (
            <span
              className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
                isQueen
                  ? "bg-primary/15 text-primary"
                  : "bg-muted text-muted-foreground"
              }`}
            >
              {/* Phase badge: the message's own phase wins over the prop. */}
              {isQueen
                ? (msg.phase ?? queenPhase) === "independent"
                  ? "independent"
                  : (msg.phase ?? queenPhase) === "running"
                    ? "running"
                    : (msg.phase ?? queenPhase) === "staging"
                      ? "staging"
                      : (msg.phase ?? queenPhase) === "planning"
                        ? "planning"
                        : "building"
                : "Worker"}
            </span>
          )}
        </div>
        {payload.kind === "single" ? (
          <QuestionWidget
            inline
            question={payload.question}
            options={payload.options}
            onSubmit={handleSingle}
            onDismiss={() => setState("dismissed")}
          />
        ) : (
          <MultiQuestionWidget
            inline
            questions={payload.questions}
            onSubmit={handleMulti}
            onDismiss={() => setState("dismissed")}
          />
        )}
      </div>
    </div>
  );
}
const MessageBubble = memo(
function MessageBubble({
msg,
@@ -596,24 +801,51 @@ export default function ChatPanel({
onScroll={handleScroll}
className="flex-1 overflow-auto px-5 py-4 space-y-3"
>
{renderItems.map((item) =>
item.kind === "parallel" ? (
<div key={item.groupId}>
<ParallelSubagentBubble
groupId={item.groupId}
groups={item.groups}
/>
</div>
) : (
<div key={item.msg.id}>
{renderItems.map((item) => {
if (item.kind === "parallel") {
return (
<div key={item.groupId}>
<ParallelSubagentBubble
groupId={item.groupId}
groups={item.groups}
/>
</div>
);
}
const msg = item.msg;
// Detect misformatted ask_user payloads emitted as plain text and
// substitute the nicer widget-based bubble. Only inspect regular
// agent messages — skip system rows, tool status, dividers, etc.
const askPayload =
(msg.role === "queen" || msg.role === "worker") &&
!msg.type &&
msg.content
? detectAskUserPayload(msg.content)
: null;
if (askPayload) {
return (
<div key={msg.id}>
<InlineAskUserBubble
msg={msg}
payload={askPayload}
activeThread={activeThread}
onSend={onSend}
queenPhase={queenPhase}
showQueenPhaseBadge={showQueenPhaseBadge}
/>
</div>
);
}
return (
<div key={msg.id}>
<MessageBubble
msg={item.msg}
msg={msg}
queenPhase={queenPhase}
showQueenPhaseBadge={showQueenPhaseBadge}
/>
</div>
),
)}
);
})}
{/* Show typing indicator while waiting for first queen response (disabled + empty chat) */}
{(isWaiting || (disabled && threadMessages.length === 0)) && (
@@ -11,9 +11,15 @@ export interface MultiQuestionWidgetProps {
questions: QuestionItem[];
onSubmit: (answers: Record<string, string>) => void;
onDismiss?: () => void;
/**
* When true, skip the global Enter-to-submit listener. Use this when rendering
* the widget inline alongside other inputs (e.g. the chat textarea) so Enter
* isn't hijacked from the surrounding UI.
*/
inline?: boolean;
}
export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }: MultiQuestionWidgetProps) {
export default function MultiQuestionWidget({ questions, onSubmit, onDismiss, inline = false }: MultiQuestionWidgetProps) {
// Per-question state: selected index (null = nothing, options.length = "Other")
const [selections, setSelections] = useState<(number | null)[]>(
() => questions.map(() => null),
@@ -50,8 +56,10 @@ export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }:
onSubmit(answers);
}, [canSubmit, submitted, questions, selections, customTexts, onSubmit]);
// Enter to submit (only when not focused on a text input)
// Enter to submit (only when not focused on a text input).
// Skipped in inline mode so the widget doesn't hijack keys from surrounding inputs.
useEffect(() => {
if (inline) return;
const handleKeyDown = (e: KeyboardEvent) => {
if (submitted) return;
const target = e.target as HTMLElement;
@@ -63,7 +71,7 @@ export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }:
};
window.addEventListener("keydown", handleKeyDown);
return () => window.removeEventListener("keydown", handleKeyDown);
}, [handleSubmit, submitted]);
}, [handleSubmit, submitted, inline]);
if (submitted) return null;
@@ -10,9 +10,15 @@ export interface QuestionWidgetProps {
onSubmit: (answer: string, isOther: boolean) => void;
/** Called when user dismisses the question without answering */
onDismiss?: () => void;
/**
* When true, the widget does not register a global keyboard listener. Set this
* when rendering the widget inline alongside other inputs (e.g. a chat textarea)
* so Enter / number keys do not get hijacked from the surrounding UI.
*/
inline?: boolean;
}
export default function QuestionWidget({ question, options, onSubmit, onDismiss }: QuestionWidgetProps) {
export default function QuestionWidget({ question, options, onSubmit, onDismiss, inline = false }: QuestionWidgetProps) {
const [selected, setSelected] = useState<number | null>(null);
const [customText, setCustomText] = useState("");
const [submitted, setSubmitted] = useState(false);
@@ -42,8 +48,10 @@ export default function QuestionWidget({ question, options, onSubmit, onDismiss
}
}, [canSubmit, submitted, isOtherSelected, customText, options, selected, onSubmit]);
// Keyboard: Enter to submit, number keys to select (only when text input is not focused)
// Keyboard: Enter to submit, number keys to select (only when text input is not focused).
// Skipped in inline mode so the widget doesn't hijack keys from surrounding inputs.
useEffect(() => {
if (inline) return;
const handleKeyDown = (e: KeyboardEvent) => {
if (submitted) return;
const inTextInput = e.target === inputRef.current;
@@ -66,7 +74,7 @@ export default function QuestionWidget({ question, options, onSubmit, onDismiss
window.addEventListener("keydown", handleKeyDown);
return () => window.removeEventListener("keydown", handleKeyDown);
}, [handleSubmit, submitted, options.length]);
}, [handleSubmit, submitted, options.length, inline]);
if (submitted) return null;
+77 -21
View File
@@ -238,6 +238,12 @@ export default function ColonyChat() {
agentStateRef.current = agentState;
const turnCounterRef = useRef<Record<string, number>>({});
// Maps tool_use_id → the pill message ID and tool name that was created for it.
// Survives turn counter resets so deferred completions (e.g. ask_user) can
// find and update the correct pill even after the counter changes.
const toolUseToPillRef = useRef<
Record<string, { msgId: string; name: string }>
>({});
const queenPhaseRef = useRef<string>("planning");
const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
const suppressIntroRef = useRef(false);
@@ -468,6 +474,7 @@ export default function ColonyChat() {
setGraphNodes([]);
setAgentState(defaultAgentState());
turnCounterRef.current = {};
toolUseToPillRef.current = {};
queenPhaseRef.current = "planning";
queenIterTextRef.current = {};
suppressIntroRef.current = false;
@@ -782,6 +789,12 @@ export default function ColonyChat() {
const toolUseId = (event.data?.tool_use_id as string) || "";
const sid = event.stream_id;
// Track which pill message this tool belongs to so deferred
// completions (ask_user) can find it after the turn counter changes.
toolUseToPillRef.current[toolUseId] = {
msgId: `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`,
name: toolName,
};
setAgentState((prev) => {
const newActive = {
...prev.activeToolCalls,
@@ -826,30 +839,73 @@ export default function ColonyChat() {
appendNodeLog(event.node_id, `${ts} INFO ${toolName} done${resultStr}`);
}
// Look up the original pill message this tool belongs to.
// For deferred completions (ask_user), the turn counter and
// activeToolCalls have already been reset, so we rely on the
// ref recorded during tool_call_started.
const tracked = toolUseToPillRef.current[toolUseId];
delete toolUseToPillRef.current[toolUseId];
const sid = event.stream_id;
// Mark done in activeToolCalls if still present (normal case)
setAgentState((prev) => {
const updated = { ...prev.activeToolCalls };
if (updated[toolUseId]) {
updated[toolUseId] = { ...updated[toolUseId], done: true };
if (!prev.activeToolCalls[toolUseId]) return prev;
return {
...prev,
activeToolCalls: {
...prev.activeToolCalls,
[toolUseId]: {
...prev.activeToolCalls[toolUseId],
done: true,
},
},
};
});
// Determine the correct pill message ID
const pillMsgId =
tracked?.msgId ??
`tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`;
const trackedName = tracked?.name;
// Update the pill message content directly
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === pillMsgId);
if (idx < 0) return prevMsgs;
try {
const parsed = JSON.parse(prevMsgs[idx].content);
const tools: { name: string; done: boolean }[] =
parsed.tools || [];
if (trackedName) {
let marked = false;
for (let i = 0; i < tools.length; i++) {
if (
tools[i].name === trackedName &&
!tools[i].done &&
!marked
) {
tools[i] = { ...tools[i], done: true };
marked = true;
}
}
}
const allDone =
tools.length > 0 && tools.every((t) => t.done);
return prevMsgs.map((m, i) =>
i === idx
? {
...m,
content: JSON.stringify({ tools, allDone }),
}
: m,
);
} catch {
return prevMsgs;
}
const tools = Object.values(updated)
.filter((t) => t.streamId === sid)
.map((t) => ({ name: t.name, done: t.done }));
const allDone = tools.length > 0 && tools.every((t) => t.done);
upsertMessage({
id: `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`,
agent: agentDisplayName || event.node_id || "Agent",
agentColor: "",
content: JSON.stringify({ tools, allDone }),
timestamp: "",
type: "tool_status",
role,
thread: agentPath,
createdAt: eventCreatedAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
});
return { ...prev, activeToolCalls: updated };
});
}
break;
+72 -48
View File
@@ -58,6 +58,12 @@ export default function QueenDM() {
const [cloneTask, setCloneTask] = useState("");
const turnCounterRef = useRef(0);
// Maps tool_use_id → the pill message ID and tool name that was created for it.
// Survives turn counter resets so deferred completions (e.g. ask_user) can
// find and update the correct pill even after llm_turn_complete bumps the counter.
const toolUseToPillRef = useRef<
Record<string, { msgId: string; name: string }>
>({});
const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
const [queenPhase, setQueenPhase] = useState<
"planning" | "building" | "staging" | "running" | "independent"
@@ -77,6 +83,7 @@ export default function QueenDM() {
setQueenPhase("independent");
setInitialDraft(null);
turnCounterRef.current = 0;
toolUseToPillRef.current = {};
queenIterTextRef.current = {};
}, []);
@@ -390,6 +397,7 @@ export default function QueenDM() {
setIsTyping(true);
setQueenReady(true);
setActiveToolCalls({});
toolUseToPillRef.current = {};
// Clear queued flag on all user messages now that the queen is processing
setMessages((prev) => {
if (!prev.some((m) => m.queued)) return prev;
@@ -560,6 +568,11 @@ export default function QueenDM() {
? new Date(event.timestamp).getTime()
: Date.now();
// Track which pill message this tool belongs to so deferred
// completions (ask_user) can find it after the turn counter changes.
const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
toolUseToPillRef.current[toolUseId] = { msgId, name: toolName };
setActiveToolCalls((prev) => {
const newActive = {
...prev,
@@ -570,7 +583,6 @@ export default function QueenDM() {
done: t.done,
}));
const allDone = tools.length > 0 && tools.every((t) => t.done);
const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
const toolMsg: ChatMessage = {
id: msgId,
agent: queenName,
@@ -611,57 +623,68 @@ export default function QueenDM() {
case "tool_call_completed": {
const toolUseId = (event.data?.tool_use_id as string) || "";
// Look up the original pill message this tool belongs to.
// For deferred completions (ask_user), the turn counter and
// activeToolCalls have already been reset by llm_turn_complete,
// so we rely on the ref recorded during tool_call_started.
const tracked = toolUseToPillRef.current[toolUseId];
delete toolUseToPillRef.current[toolUseId];
// Mark done in activeToolCalls if still present (normal case)
setActiveToolCalls((prev) => {
if (!prev[toolUseId]) return prev;
return {
...prev,
[toolUseId]: { ...prev[toolUseId], done: true },
};
});
// Determine the correct pill message ID
const sid = event.stream_id;
const execId = event.execution_id || "exec";
const eventCreatedAt = event.timestamp
? new Date(event.timestamp).getTime()
: Date.now();
const pillMsgId =
tracked?.msgId ??
`tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
const toolName = tracked?.name;
setActiveToolCalls((prev) => {
const updated = { ...prev };
if (updated[toolUseId]) {
updated[toolUseId] = { ...updated[toolUseId], done: true };
// Update the pill message content directly
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === pillMsgId);
if (idx < 0) return prevMsgs;
try {
const parsed = JSON.parse(prevMsgs[idx].content);
const tools: { name: string; done: boolean }[] =
parsed.tools || [];
if (toolName) {
let marked = false;
for (let i = 0; i < tools.length; i++) {
if (
tools[i].name === toolName &&
!tools[i].done &&
!marked
) {
tools[i] = { ...tools[i], done: true };
marked = true;
}
}
}
const allDone =
tools.length > 0 && tools.every((t) => t.done);
return prevMsgs.map((m, i) =>
i === idx
? {
...m,
content: JSON.stringify({ tools, allDone }),
}
: m,
);
} catch {
return prevMsgs;
}
const tools = Object.entries(updated).map(([, t]) => ({
name: t.name,
done: t.done,
}));
const allDone = tools.length > 0 && tools.every((t) => t.done);
const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
const toolMsg: ChatMessage = {
id: msgId,
agent: queenName,
agentColor: "",
content: JSON.stringify({ tools, allDone }),
timestamp: "",
type: "tool_status",
role: "queen",
thread: "queen-dm",
createdAt: eventCreatedAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
};
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === msgId);
if (idx >= 0) {
return prevMsgs.map((m, i) =>
i === idx ? { ...toolMsg, createdAt: m.createdAt ?? toolMsg.createdAt } : m,
);
}
// Insert in sorted position by createdAt
const ts = toolMsg.createdAt ?? Date.now();
let insertIdx = prevMsgs.length - 1;
while (insertIdx >= 0 && (prevMsgs[insertIdx].createdAt ?? 0) > ts) {
insertIdx--;
}
if (insertIdx === -1 || insertIdx === prevMsgs.length - 1) {
return [...prevMsgs, toolMsg];
}
const next = [...prevMsgs];
next.splice(insertIdx + 1, 0, toolMsg);
return next;
});
return updated;
});
break;
}
@@ -746,6 +769,7 @@ export default function QueenDM() {
setIsTyping(false);
setIsStreaming(false);
setActiveToolCalls({});
toolUseToPillRef.current = {};
// Clear queued flags since the queen is now idle
setMessages((prev) => {
if (!prev.some((m) => m.queued)) return prev;
+55 -1
View File
@@ -4,7 +4,8 @@ from __future__ import annotations
import pytest
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool
class TestSupportsImageToolResults:
@@ -56,3 +57,56 @@ class TestSupportsImageToolResults:
assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
assert supports_image_tool_results("OLLAMA/llama3") is False
assert supports_image_tool_results("GPT-4o") is True
class TestFilterToolsForModel:
    """Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""

    def test_hides_image_tool_from_text_only_model(self):
        catalog = [
            Tool(name="read_file", description="read a file"),
            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
            Tool(name="browser_snapshot", description="get page content"),
        ]
        kept, hidden = filter_tools_for_model(catalog, "glm-5")
        kept_names = {tool.name for tool in kept}
        assert "browser_screenshot" not in kept_names
        assert "read_file" in kept_names
        assert "browser_snapshot" in kept_names
        assert hidden == ["browser_screenshot"]

    def test_keeps_image_tool_for_vision_model(self):
        catalog = [
            Tool(name="read_file", description="read a file"),
            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
        ]
        kept, hidden = filter_tools_for_model(catalog, "claude-sonnet-4-20250514")
        assert {tool.name for tool in kept} == {"read_file", "browser_screenshot"}
        assert hidden == []

    def test_default_tools_are_not_filtered(self):
        """Tools without produces_image (default False) are kept for all models."""
        plain_tools = [
            Tool(name="read_file", description="read a file"),
            Tool(name="web_search", description="search the web"),
        ]
        # Same outcome for a text-only model and a vision model.
        for model in ("glm-5", "gpt-4o"):
            kept, hidden = filter_tools_for_model(plain_tools, model)
            assert len(kept) == 2
            assert hidden == []

    def test_empty_model_string_returns_tools_unchanged(self):
        """Guards the ctx.llm-missing path where model is empty."""
        catalog = [
            Tool(name="browser_screenshot", description="", produces_image=True),
        ]
        kept, hidden = filter_tools_for_model(catalog, "")
        assert len(kept) == 1
        assert hidden == []

    def test_returned_list_is_a_copy(self):
        """Caller should be free to mutate the filtered list without affecting input."""
        catalog = [Tool(name="read_file", description="")]
        kept, _ = filter_tools_for_model(catalog, "gpt-4o")
        kept.append(Tool(name="extra", description=""))
        assert len(catalog) == 1
+65
View File
@@ -0,0 +1,65 @@
"""Tests for vision-only prompt block stripping in Queen nodes.
Covers ``finalize_queen_prompt``, the function that resolves
``<!-- vision-only -->...<!-- /vision-only -->`` markers in Queen phase
prompts before they reach the LLM. Vision-capable models see the inner
content; text-only models see the block removed entirely.
"""
from __future__ import annotations
from framework.agents.queen.nodes import finalize_queen_prompt
class TestFinalizeQueenPrompt:
    def test_vision_model_keeps_inner_content_and_strips_markers(self):
        prompt = "before <!-- vision-only -->secret<!-- /vision-only --> after"
        assert finalize_queen_prompt(prompt, has_vision=True) == "before secret after"

    def test_text_only_model_removes_entire_block(self):
        prompt = "before <!-- vision-only -->secret<!-- /vision-only --> after"
        stripped = finalize_queen_prompt(prompt, has_vision=False)
        assert stripped == "before after"
        assert "secret" not in stripped
        assert "vision-only" not in stripped

    def test_multiline_block_handled(self):
        """Regex must use DOTALL so blocks can span newlines."""
        prompt = (
            "- item 1\n"
            "<!-- vision-only -->\n"
            "- item 2 (vision only)\n"
            "<!-- /vision-only -->\n"
            "- item 3\n"
        )
        with_vision = finalize_queen_prompt(prompt, has_vision=True)
        without_vision = finalize_queen_prompt(prompt, has_vision=False)
        assert "- item 2 (vision only)" in with_vision
        assert "- item 2 (vision only)" not in without_vision
        assert "- item 1" in without_vision and "- item 3" in without_vision

    def test_multiple_blocks_in_same_text(self):
        prompt = (
            "A <!-- vision-only -->X<!-- /vision-only --> "
            "B <!-- vision-only -->Y<!-- /vision-only --> C"
        )
        assert finalize_queen_prompt(prompt, has_vision=True) == "A X B Y C"
        assert finalize_queen_prompt(prompt, has_vision=False) == "A B C"

    def test_non_greedy_match_does_not_swallow_between_blocks(self):
        """A naïve greedy regex would match from the first opening marker
        to the last closing marker and wipe out the middle section. Lock
        that down so a future refactor can't regress to greedy."""
        prompt = (
            "<!-- vision-only -->first<!-- /vision-only -->"
            "KEEP"
            "<!-- vision-only -->second<!-- /vision-only -->"
        )
        assert finalize_queen_prompt(prompt, has_vision=False) == "KEEP"
        assert finalize_queen_prompt(prompt, has_vision=True) == "firstKEEPsecond"

    def test_text_without_markers_is_unchanged(self):
        prompt = "plain prompt with no markers at all"
        assert finalize_queen_prompt(prompt, has_vision=True) == prompt
        assert finalize_queen_prompt(prompt, has_vision=False) == prompt
+37 -3
View File
@@ -94,7 +94,10 @@ class TestSkillCatalog:
assert "<name>beta</name>" in prompt
assert "<description>Alpha skill</description>" in prompt
assert "<location>/p/alpha/SKILL.md</location>" in prompt
assert "<base_dir>/p/alpha</base_dir>" in prompt
# <base_dir> is intentionally not emitted — the mandatory header
# tells the model to resolve relative paths against the parent of
# SKILL.md, so the redundant element was dropped.
assert "<base_dir>" not in prompt
def test_to_prompt_sorted_by_name(self):
skills = [
@@ -130,13 +133,44 @@ class TestSkillCatalog:
assert "<name>usr</name>" in prompt
assert "<name>fw</name>" in prompt
def test_to_prompt_contains_behavioral_instruction(self):
def test_to_prompt_contains_mandatory_header(self):
"""The rendered catalog must carry the mandatory pre-reply checklist
so soft guidance turns into a required step."""
catalog = SkillCatalog([_make_skill(source_scope="project")])
prompt = catalog.to_prompt()
assert "When a task matches a skill's description" in prompt
assert "## Skills (mandatory)" in prompt
assert "Before replying: scan <available_skills>" in prompt
assert "never read more than one skill up front" in prompt
assert "`read_file`" in prompt
assert "SKILL.md" in prompt
def test_to_prompt_compact_fallback_drops_descriptions(self):
"""When the full XML body exceeds the char threshold, the compact
variant drops <description> but keeps every skill's <name>."""
# Each skill contributes ~100+ chars with a long description.
# 60 skills easily pushes the body past the threshold.
skills = [
_make_skill(
name=f"skill-{i:03d}",
description="A reasonably long description " * 4,
location=f"/s/skill-{i:03d}/SKILL.md",
base_dir=f"/s/skill-{i:03d}",
)
for i in range(60)
]
catalog = SkillCatalog(skills)
prompt = catalog.to_prompt()
# Mandatory header still present but uses the compact variant wording.
assert "## Skills (mandatory)" in prompt
assert "scan <available_skills> <name>" in prompt
# Every skill's name survives …
for i in range(60):
assert f"<name>skill-{i:03d}</name>" in prompt
# … but no descriptions were rendered.
assert "<description>" not in prompt
def test_build_pre_activated_prompt(self):
skill = _make_skill("research", body="## Deep Research\nDo thorough research.")
catalog = SkillCatalog([skill])
+6 -26
View File
@@ -1,9 +1,14 @@
"""Tests for AS-6 skill resource loading support.
Covers:
- <base_dir> element in catalog XML
- allowlisted_dirs property reflects trusted skill base directories
- skill_dirs propagation to NodeContext
The catalog XML previously emitted a redundant <base_dir> element next to
each <location>. That was dropped when the mandatory header took over the
"resolve relative paths against the parent of SKILL.md" instruction, so
there is no longer an XML-emission test for base_dir. Programmatic access
via ``catalog.allowlisted_dirs`` is still covered below.
"""
from framework.skills.catalog import SkillCatalog
@@ -26,31 +31,6 @@ def _make_skill(
class TestSkillResourceBaseDir:
def test_base_dir_in_xml(self):
"""Each community skill entry should expose its base_dir in the catalog XML."""
skill = _make_skill("deploy", "/project/.hive/skills/deploy")
catalog = SkillCatalog([skill])
prompt = catalog.to_prompt()
assert "<base_dir>/project/.hive/skills/deploy</base_dir>" in prompt
def test_base_dir_xml_escaped(self):
"""base_dir with XML-special chars should be escaped."""
skill = _make_skill("s", "/path/with <&> chars")
catalog = SkillCatalog([skill])
prompt = catalog.to_prompt()
assert "<base_dir>/path/with &lt;&amp;&gt; chars</base_dir>" in prompt
def test_base_dir_present_for_framework_skills(self):
"""Framework-scope skills now appear in the catalog like any other scope,
and their base_dir is included in the XML."""
skill = _make_skill("fw", "/hive/_default_skills/fw", source_scope="framework")
catalog = SkillCatalog([skill])
prompt = catalog.to_prompt()
assert "<name>fw</name>" in prompt
assert "<base_dir>/hive/_default_skills/fw</base_dir>" in prompt
def test_allowlisted_dirs_matches_skills(self):
"""allowlisted_dirs returns all skill base_dirs including framework ones."""
skills = [
+49
View File
@@ -799,6 +799,55 @@ def test_resync_returns_false_when_credentials_unchanged(tmp_path, monkeypatch):
assert registry.resync_mcp_servers_if_needed() is False
class TestMcpToolProducesImageFlag:
    """Verify _convert_mcp_tool_to_framework_tool sets produces_image from the name.

    This is the detection step that the filter in AgentLoop depends on:
    if the regex regresses, text-only models will start seeing screenshot
    tools they can't use.
    """

    @staticmethod
    def _mcp_tool(name: str):
        # Minimal stand-in for an MCP tool record — only the attributes the
        # converter reads.
        return SimpleNamespace(
            name=name,
            description=f"{name} description",
            input_schema={"type": "object", "properties": {}, "required": []},
            server_name="test",
        )

    def test_screenshot_flagged(self):
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("browser_screenshot")
        )
        assert converted.produces_image is True

    def test_snapshot_not_flagged(self):
        """browser_snapshot returns a DOM tree, not an image — must not match."""
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("browser_snapshot")
        )
        assert converted.produces_image is False

    def test_case_insensitive_match(self):
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("TakeScreenshot")
        )
        assert converted.produces_image is True

    def test_plain_tool_not_flagged(self):
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("read_file")
        )
        assert converted.produces_image is False

    def test_image_suffix_variants_flagged(self):
        registry = ToolRegistry()
        for name in ("capture_image", "render_image", "get_image", "snapshot_image"):
            converted = registry._convert_mcp_tool_to_framework_tool(self._mcp_tool(name))  # noqa: SLF001
            assert converted.produces_image is True, f"{name} should be flagged"
# ---------------------------------------------------------------------------
# Concurrency-safe flag propagation
# ---------------------------------------------------------------------------
+176
View File
@@ -0,0 +1,176 @@
# 🐝 Hive Agent v0.10.0: The Colony
> ⚠️ **Breaking change.** This is a large architectural refactor of how agents work in Hive. **Old agents are no longer compatible.** Existing workspaces, custom agents, and saved sessions from pre-v0.10.0 builds will need to be recreated.
---
## ✨ Highlights
The **Colony** introduces a new way of working: a group of specialized workers operating together to run and scale your business.
The role of the **Queen** has evolved. Instead of only orchestrating, the Queen now **executes work first** to deliver immediate value, then **builds systems around that work** to create stable, repeatable business processes.
You now have a full leadership team of eight Queens, each with their own identity, expertise, and voice:
| Queen | Role |
| --- | --- |
| **Sophia** | Head of Brand & Design |
| **Charlotte** | Head of Finance & Fundraising |
| **Victoria** | Head of Growth |
| **Eleanor** | Head of Legal |
| **Rachel** | Head of Operations |
| **Isabella** | Head of Product Strategy |
| **Amelia** | Head of Talent |
| **Alexandra** | Head of Technology |
Start automating your business processes with your Queens today.
---
## 🏛️ The Colony Architecture
### Queens as Identities, Not Just Orchestrators
- **Queen profiles** — each queen is a YAML-backed persona (`~/.hive/agents/queens/{queen_id}/profile.yaml`) with core traits, hidden background, psychological profile, behavior triggers, and skill sets. Profiles are injected into the system prompt at session start.
- **CEO-style queen selection** — an LLM classifier routes every new user request to the best-matching queen based on the task at hand, with structured routing diagnostics (`QueenSelection`).
- **Queen DMs** — direct-message pages for each queen with a dedicated session flow, session switcher, and prompt library integration.
- **Independent / PM mode** — queens run in an independent mode for planning-phase work, with a "think out loud" internal monologue surfaced through internal tags.
- **Queen memory v2** — simplified memory implementation with reflection agent, cooldown-gated reflections, user identity, doppelganger wiring, and recall-selector for targeted retrieval.
- **Queen lifecycle tools** — first-class tools for escalation, queen reply, and session handoff.
### Colony Runtime
- **Grand architecture revamp** — the framework, agent loop, runtime, graph, pipeline, executor, and node worker layers have been rewritten from the ground up. Deprecated shims and legacy orchestration paths have been removed.
- **Colony creation flow** — colonies are created via skill, with reliable event bus subscription, worker spawning, and post-creation list refresh.
- **Scheduled triggers** — colonies can now be woken on a cron schedule, with triggers firing directly into the owning queen's session.
- **Simple fork** for agents, stable credential states, and improved worker execution reliability.
---
## 🆕 What's New
### Colony & Queens
- 8 default queen personas (Alexandra, Victoria, Isabella, Charlotte, Eleanor, Sophia, Amelia, Rachel) with profile YAML, examples, and behavior triggers
- LLM-based queen selector with reasoning output
- Queen DM page, queen session switcher, and sidebar queen item
- Queen scope memory, role examples, and identity loading
- Reflection agent with cooldown and improved reflection runner
- Queen orchestrator + `routes_queens` API
- Natural chat replies and cleaner home-prompt bootstrap
- Queen identity for new sessions
- `ask_user` / `ask_user_multiple` tools available in queen prompt
- Escalation and queen-reply tools
### Skills & Tools
- **Learned default skills** — skills the queen has learned become part of her baseline
- **Tool-gated skill activation** — skills only activate when their required tools are present
- **Skills for colonies** — per-colony skill registration and loading
- **Text-only model filter** — image-producing tools and vision-only prompt blocks are hidden from text-only models
- **Browser skills upgrade** — improved click reliability, screenshot capture, and credential filtering
- **Deprecated-tool removal** and alignment of Hive tool names across the codebase
- **Ask-user widget** with fallback rendering and preserved tool pill mapping across turn boundaries for deferred completions
- **Improved tool-call reliability** across the board (tool limit removed, tool blacklist, tool credential filter)
- **MCP** — efficient MCP loading at initialization, default MCP bootstrapping, registered available MCP tools, fixed MCP tool initialization and registry pipeline stage
### LLM & Credentials
- **Key pool** for credential management with stable credential states
- **Aden credentials storage adapter** and subscription-based LLM config activation endpoint
- **Consolidated model config** with unified model catalog
- **New providers** — Kimi, Hive, and Aden added to the model catalog
- **Model switcher** UI with runtime model switching API
- **LLM key validation endpoint** with agent errors surfaced via SSE
- **BYOK modal** import fixes for subscription token detection
### Frontend
- **Home redesign** — new home, credentials, and org chart pages
- **Colony chat** and **queen DM** pages
- **Sidebar + header** components and global app layout/routing
- **Model switcher, settings modal, template card**
- **Prompt library** with search, category filtering, and UI polish
- **Side panel** fixes and sub-agent pane light-mode support
- **Flowchart** light-mode support and normalized settings modal sizing
- **User profile settings** and UI enhancements
- **Sync user profile** to global memory as `user-profile.md`; queen profile API transformation
- Removed the old workspace GUI and its dependencies
### Framework & Runtime
- Architecture revamp: new runtime config, simplified agent loading, new infra for queen
- Home hive directory structure refactor
- Agent loading pipeline fixes, MCP registry pipeline stage fix
- Session resume improvements: separate resume vs new-session flow for queen sessions, edge-case fix for message injection in resumed sessions
- Strip internal tags from user-visible output
- Colony event bus subscription fixes and shared event bus for parent visibility
- Worker spawn and stop-worker fixes
- Default log level and extra logging hooks
---
## 🐛 Bug Fixes
- **Ask-user widget** — fallback when widget fails to mount
- **Skill loading** for colonies and proper skill resolution across queen sessions
- **Model switching** and new-chat flow no longer carry stale state
- **Tool pill mapping** preserved across turn boundary for deferred `ask_user` completions
- **Tool limit** removed (was capping legitimate long tool lists)
- **Queen loading** stability fixes
- **Side panel** rendering issues
- **Deprecated graphs** removed from UI
- **Home-page prompts** now reach the queen directly without waiting for the greeting to finish
- **Colony creation** link, reframing, and post-creation refresh
- **Build error** in colony creation path
- **GCU system prompt** tuning
- **Tool credential filter** correctness
- **Screenshot** capture and browser click reliability
- **Queen message injection** when resuming a session
- **Internal-tag diction** fixes in surfaced output
- **MCP tool initialization** on cold start
- **Frontend DM** edge cases
- **Prompt library** new-session handling for new chat
- **Config validation** and unavailable Minimax model handling
- **Queen identity** loading on cold boot
- **Extra text** in queen selector JSON response parsed safely
- **Outdated queen communication prompt** removed
---
## 🧹 Refactor & Cleanup
- **Shatter the Eld\*n ring** — top-to-bottom refactor of the runtime core
- **Grand clean-up** of deprecated code paths
- **Remove deprecated shims** and old session-status tools
- **Big test cleanup** — integration tests and component tests rewritten around the new architecture
- **Update references** for orchestrator / host / loader renames
- **Consolidate tests** for queen state machine and verified outcomes
- **Remove old workspace GUI** and its dependencies
- **Remove old "new agent" button** and deprecated entry points
- **Home hive directory** structure refactor
---
## ⚠️ Breaking Changes
- **Old agents are not compatible.** Custom agents authored against the pre-v0.10.0 framework will need to be re-authored against the new Queen/Colony runtime.
- **Session format** — pre-v0.10.0 sessions cannot be resumed.
- **Deprecated tools removed** and Hive tool names have been realigned; any external scripts referencing old tool names must be updated.
- **Old session-status tools** removed in favor of the new queen lifecycle tools.
- **Workspace GUI removed** — the legacy workspace UI is gone; use the new home, colony chat, and queen DM pages.
- **MCP registry pipeline** — MCP configurations now load through the new registry; custom MCP setups may need to be re-registered.
---
## 🚀 Upgrading
Because this release rewrites the agent runtime, the recommended upgrade path is:
1. Back up `~/.hive/` if you have sessions or custom agents you want to reference.
2. Pull `main` at the v0.10.0 tag.
3. Let Hive initialize the new queen profiles under `~/.hive/agents/queens/`.
4. Re-create any custom agents as colonies/queens against the new framework.
5. Re-register any custom MCP servers through the new MCP registry.
Welcome to the Colony. 🐝
@@ -255,9 +255,10 @@ def register_tools(mcp: FastMCP) -> None:
# Clean up whitespace
text = " ".join(text.split())
# Truncate if needed
# Truncate if needed (reserve 3 chars for the ellipsis so the
# final string stays within max_length)
if len(text) > max_length:
text = text[:max_length] + "..."
text = text[: max_length - 3] + "..."
result: dict[str, Any] = {
"url": url,
+18
View File
@@ -113,6 +113,24 @@ class TestWebScrapeTool:
assert isinstance(result, dict)
assert "error" not in result
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_truncation_respects_max_length(self, mock_pw, mock_stealth, web_scrape_fn):
"""Truncated content (including the ellipsis) must not exceed max_length."""
# max_length is clamped to >=1000, so build content larger than that
long_text = "a" * 5000
html = f"<html><body>{long_text}</body></html>"
mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
mock_pw.return_value = mock_cm
mock_stealth.return_value.apply_stealth_async = AsyncMock()
result = await web_scrape_fn(url="https://example.com", max_length=1000)
assert "error" not in result
assert len(result["content"]) <= 1000
assert result["content"].endswith("...")
assert result["length"] == len(result["content"])
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)