Compare commits

...

12 Commits

Author SHA1 Message Date
Richard Tang 22df99ef51 Merge remote-tracking branch 'origin/main'
Release / Create Release (push) Waiting to run
2026-04-14 19:56:33 -07:00
Richard Tang edc3135797 Merge branch 'feature/new-colony' 2026-04-14 19:56:08 -07:00
Richard Tang 27b15789fb fix: skills prompts 2026-04-14 18:51:14 -07:00
RichardTang-Aden 5ba5933edc Merge pull request #7046 from vincentjiang777/main
docs: new readme
2026-04-14 18:02:49 -07:00
Timothy 50eb4b0e8f Merge branch 'feature/colony-creation' into feature/new-colony 2026-04-14 16:34:30 -07:00
Richard Tang 3e4a4c9924 Merge remote-tracking branch 'origin/feat/text-only-tool-filter' into feature/new-colony 2026-04-14 16:29:19 -07:00
Richard Tang c47987e73c fix: ask user widget fallback 2026-04-14 16:27:12 -07:00
Richard Tang 8f5daf0569 fix: switching model and new chat 2026-04-14 16:04:07 -07:00
bryan af5c72e785 feat: hide image-producing tools and vision-only prompt blocks from text-only models 2026-04-14 12:50:44 -07:00
bryan 5cdc01cb8c fix: preserve tool pill mapping across turn boundary for deferred ask_user completions 2026-04-14 10:56:38 -07:00
Hundao 2f58cce781 fix(tools): web_scrape truncation no longer exceeds max_length (#7044)
The previous code did `text[:max_length] + "..."`, which made the
returned content always 3 chars longer than the requested max_length.
Reserve room for the ellipsis inside the limit so the contract holds.

Fixes #2098
2026-04-14 14:24:42 +08:00
vincentjiang777 9dc214cfd2 Merge branch 'aden-hive:main' into main 2026-04-10 20:35:42 -07:00
22 changed files with 1018 additions and 175 deletions
+9 -2
View File
@@ -87,7 +87,7 @@ from framework.agent_loop.internals.types import (
)
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
from framework.host.event_bus import EventBus
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool, ToolResult, ToolUse
from framework.llm.stream_events import (
FinishEvent,
@@ -632,13 +632,20 @@ class AgentLoop(AgentProtocol):
if isinstance(stream_id, str) and stream_id.startswith("worker:"):
tools.append(build_report_to_parent_tool())
# Hide image-producing tools from text-only models so they never try
# to call them. Avoids wasted turns + "screenshot failed" lessons
# getting saved to memory. See framework.llm.capabilities.
_llm_model = ctx.llm.model if ctx.llm else ""
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
logger.info(
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s",
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
node_id,
len(tools),
[t.name for t in tools],
ctx.supports_direct_user_io,
type(self._judge).__name__ if self._judge else "None",
_hidden_image_tools,
)
# 4. Publish loop started
+26 -2
View File
@@ -1,5 +1,6 @@
"""Node definitions for Queen agent."""
import re
from pathlib import Path
from framework.orchestrator import NodeSpec
@@ -32,6 +33,29 @@ def _build_appendices() -> str:
return parts
# Wraps prompt sections that should only be shown to vision-capable models.
# Content inside `<!-- vision-only -->...<!-- /vision-only -->` is kept for
# vision models and stripped for text-only models. Applied once per session
# in queen_orchestrator.create_queen.
_VISION_ONLY_BLOCK_RE = re.compile(
    r"<!-- vision-only -->(.*?)<!-- /vision-only -->",
    re.DOTALL,
)


def finalize_queen_prompt(text: str, has_vision: bool) -> str:
    """Resolve `<!-- vision-only -->` blocks based on model capability.

    Vision-capable models keep the inner content with the markers
    stripped; text-only models lose the entire block (markers plus
    content) so the queen is never nudged toward tools it cannot
    usefully invoke.
    """
    # Vision models keep the captured group; text-only models drop it all.
    replacement = r"\1" if has_vision else ""
    return _VISION_ONLY_BLOCK_RE.sub(replacement, text)
# Shared appendices — appended to every coding node's system prompt.
_appendices = _build_appendices()
@@ -504,7 +528,7 @@ The queen writes final production-ready system prompts directly.
MCP servers are loaded from the global registry by name. Available servers:
- `hive_tools` web search, email, CRM, calendar, 100+ integrations
- `gcu-tools` browser automation (click, type, navigate, screenshot)
- `gcu-tools` browser automation (click, type, navigate<!-- vision-only -->, screenshot<!-- /vision-only -->)
- `files-tools` file I/O (read, write, edit, search, list)
**Template variables:** Add a `variables:` section at the top of agent.json \
@@ -862,7 +886,7 @@ search_files, run_command, undo_changes
## Browser Automation (gcu-tools MCP)
All browser tools are prefixed with `browser_` (browser_start, browser_navigate, \
browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \
browser_click, browser_fill, browser_snapshot, <!-- vision-only -->browser_screenshot, <!-- /vision-only -->browser_scroll, \
browser_tabs, browser_close, browser_evaluate, etc.).
Follow the browser-automation skill protocol — activate it before using browser tools.
@@ -21,7 +21,9 @@ All tools are prefixed with `browser_`:
- `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type` — interact
- `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
- `browser_snapshot` — compact accessibility-tree read (structured)
<!-- vision-only -->
- `browser_screenshot` — visual capture (annotated PNG)
<!-- /vision-only -->
- `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`)
- `browser_coords` — convert image pixels to CSS pixels (always use `css_x/y`, never `physical_x/y`)
- `browser_scroll`, `browser_wait` — navigation helpers
+24
View File
@@ -12,6 +12,11 @@ Vision support rules are derived from official vendor documentation:
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from framework.llm.provider import Tool
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
@@ -104,3 +109,22 @@ def supports_image_tool_results(model: str) -> bool:
# 5. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
    """Drop image-producing tools for text-only models.

    Returns ``(filtered_tools, hidden_names)``. For vision-capable models
    (or when *model* is empty) the input list is returned unchanged and
    ``hidden_names`` is empty. For text-only models any tool with
    ``produces_image=True`` is removed so the LLM never sees it in its
    schema — this avoids wasted calls and stale "screenshot failed"
    entries in agent memory.
    """
    # An empty model string means "unknown capability"; default to
    # showing every tool rather than silently hiding any.
    if not model or supports_image_tool_results(model):
        return list(tools), []
    hidden = [t.name for t in tools if t.produces_image]
    if not hidden:
        # Nothing image-producing to hide; still return a fresh copy so
        # callers can mutate the result without touching the input list.
        return list(tools), []
    kept = [t for t in tools if not t.produces_image]
    return kept, hidden
+3
View File
@@ -27,6 +27,9 @@ class Tool:
name: str
description: str
parameters: dict[str, Any] = field(default_factory=dict)
# If True, the tool may return ImageContent in its result. Text-only models
# (e.g. glm-5, deepseek-chat) have this hidden from their schema entirely.
produces_image: bool = False
# If True, this tool performs no filesystem/process/network writes and is
# safe to run concurrently with other safe-flagged tools inside the same
# assistant turn. Unsafe tools (writes, shell, browser actions) are always
+12
View File
@@ -7,6 +7,7 @@ import inspect
import json
import logging
import os
import re
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
@@ -18,6 +19,16 @@ logger = logging.getLogger(__name__)
_INPUT_LOG_MAX_LEN = 500
# Tools whose names match this pattern are assumed to return ImageContent.
# Matched against the bare tool name (case-insensitive). Used to mark MCP
# tools with produces_image=True so they can be filtered out for text-only
# models before the schema is ever shown to the LLM (avoids wasted calls
# and "screenshot failed" entries polluting memory).
# NOTE(review): this is a substring heuristic — a tool named e.g.
# `delete_screenshot` or `screenshot_list` would also be flagged as
# image-producing; confirm that is acceptable for the registered tools.
_IMAGE_TOOL_NAME_RE = re.compile(
    r"(screenshot|screen_capture|capture_image|render_image|get_image|snapshot_image)",
    re.IGNORECASE,
)
# Per-execution context overrides. Each asyncio task (and thus each
# concurrent graph execution) gets its own copy, so there are no races
# when multiple ExecutionStreams run in parallel.
@@ -998,6 +1009,7 @@ class ToolRegistry:
"properties": properties,
"required": required,
},
produces_image=bool(_IMAGE_TOOL_NAME_RE.search(mcp_tool.name or "")),
concurrency_safe=mcp_tool.name in self.CONCURRENCY_SAFE_TOOLS,
)
+53 -32
View File
@@ -311,7 +311,9 @@ async def create_queen(
_queen_tools_running,
_queen_tools_staging,
_shared_building_knowledge,
finalize_queen_prompt,
)
from framework.llm.capabilities import supports_image_tool_results
from framework.host.event_bus import AgentEvent, EventType
from framework.loader.mcp_registry import MCPRegistry
from framework.loader.tool_registry import ToolRegistry
@@ -489,6 +491,13 @@ async def create_queen(
"according to your current phase."
)
# Resolve vision-only prompt sections based on the session's LLM.
# session.llm is immutable for the session's lifetime, so this check
# is stable — prompts never need to be recomposed mid-session.
_has_vision = bool(
session.llm and supports_image_tool_results(getattr(session.llm, "model", ""))
)
_planning_body = (
_queen_character_core
+ _queen_role_planning
@@ -500,7 +509,7 @@ async def create_queen(
+ _planning_knowledge
+ worker_identity
)
phase_state.prompt_planning = _planning_body
phase_state.prompt_planning = finalize_queen_prompt(_planning_body, _has_vision)
_building_body = (
_queen_character_core
@@ -515,40 +524,52 @@ async def create_queen(
+ _appendices
+ worker_identity
)
phase_state.prompt_building = _building_body
phase_state.prompt_staging = (
_queen_character_core
+ _queen_role_staging
+ _queen_style
+ _queen_tools_staging
+ _queen_behavior_always
+ _queen_behavior_staging
+ worker_identity
phase_state.prompt_building = finalize_queen_prompt(_building_body, _has_vision)
phase_state.prompt_staging = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_staging
+ _queen_style
+ _queen_tools_staging
+ _queen_behavior_always
+ _queen_behavior_staging
+ worker_identity
),
_has_vision,
)
phase_state.prompt_running = (
_queen_character_core
+ _queen_role_running
+ _queen_style
+ _queen_tools_running
+ _queen_behavior_always
+ _queen_behavior_running
+ worker_identity
phase_state.prompt_running = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_running
+ _queen_style
+ _queen_tools_running
+ _queen_behavior_always
+ _queen_behavior_running
+ worker_identity
),
_has_vision,
)
phase_state.prompt_editing = (
_queen_identity_editing
+ _queen_style
+ _queen_tools_editing
+ _queen_behavior_always
+ _queen_behavior_editing
+ worker_identity
phase_state.prompt_editing = finalize_queen_prompt(
(
_queen_identity_editing
+ _queen_style
+ _queen_tools_editing
+ _queen_behavior_always
+ _queen_behavior_editing
+ worker_identity
),
_has_vision,
)
phase_state.prompt_independent = (
_queen_character_core
+ _queen_role_independent
+ _queen_style
+ _queen_tools_independent
+ _queen_behavior_always
+ _queen_behavior_independent
phase_state.prompt_independent = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_independent
+ _queen_style
+ _queen_tools_independent
+ _queen_behavior_always
+ _queen_behavior_independent
),
_has_vision,
)
# ---- Default skill protocols -------------------------------------
+7 -1
View File
@@ -284,10 +284,16 @@ def _get_subscription_token(sub_id: str) -> str | None:
def _hot_swap_sessions(
request: web.Request, full_model: str, api_key: str | None, api_base: str | None
) -> int:
"""Hot-swap the LLM on all running sessions. Returns count of swapped sessions."""
"""Hot-swap the LLM on all running sessions. Returns count of swapped sessions.
Also refreshes the SessionManager's default model so that subsequent
one-shot LLM consumers (e.g. /messages/classify, new session bootstrap)
pick up the new provider/model instead of the stale startup override.
"""
from framework.server.session_manager import SessionManager
manager: SessionManager = request.app["manager"]
manager._model = full_model
swapped = 0
for session in manager.list_sessions():
llm_provider = getattr(session, "llm", None)
+57 -18
View File
@@ -14,13 +14,37 @@ from framework.skills.skill_errors import SkillErrorCode, log_skill_error
logger = logging.getLogger(__name__)
_BEHAVIORAL_INSTRUCTION = (
"The following skills provide specialized instructions for specific tasks.\n"
"When a task matches a skill's description, read the SKILL.md at the listed\n"
"location to load the full instructions before proceeding.\n"
"When a skill references relative paths, resolve them against the skill's\n"
"directory (the parent of SKILL.md) and use absolute paths in tool calls."
)
# Upper bound on the raw `<available_skills>` XML body, in characters.
# When the full catalog (with <description> entries) exceeds this, we fall
# back to the compact variant that drops descriptions but keeps every skill
# visible. Preserving awareness of every skill beats truncating entries.
_COMPACT_THRESHOLD_CHARS = 5000
_MANDATORY_HEADER_FULL = """## Skills (mandatory)
Before replying: scan <available_skills> <description> entries.
- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
- If multiple could apply: choose the most specific one, then read/follow it.
- If none clearly apply: do not read any SKILL.md.
Constraints: never read more than one skill up front; only read after selecting.
- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
The following skills provide specialized instructions for specific tasks.
Use `read_file` to load a skill's SKILL.md when the task matches its description.
When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
_MANDATORY_HEADER_COMPACT = """## Skills (mandatory)
Before replying: scan <available_skills> <name> entries.
- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
- If multiple could apply: choose the most specific one, then read/follow it.
- If none clearly apply: do not read any SKILL.md.
Constraints: never read more than one skill up front; only read after selecting.
- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
The following skills provide specialized instructions for specific tasks.
Use `read_file` to load a skill's SKILL.md when the task matches its name.
When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
class SkillCatalog:
@@ -61,27 +85,42 @@ class SkillCatalog:
def to_prompt(self) -> str:
"""Generate the catalog prompt for system prompt injection.
Returns empty string if no community/user skills are discovered
(default skills are handled separately by DefaultSkillManager).
"""
# All skills go through the catalog for progressive disclosure.
all_skills = list(self._skills.values())
Returns empty string when no skills are present. Otherwise returns
a mandatory pre-reply checklist + decision rules + rate-limit note,
followed by the <available_skills> XML body.
When the full XML body exceeds ``_COMPACT_THRESHOLD_CHARS``, the
compact variant is emitted instead: <description> elements are
dropped so every skill stays visible before any gets truncated.
"""
all_skills = sorted(self._skills.values(), key=lambda s: s.name)
if not all_skills:
return ""
full_xml = self._render_xml(all_skills, compact=False)
if len(full_xml) <= _COMPACT_THRESHOLD_CHARS:
return f"{_MANDATORY_HEADER_FULL}\n\n{full_xml}"
compact_xml = self._render_xml(all_skills, compact=True)
return f"{_MANDATORY_HEADER_COMPACT}\n\n{compact_xml}"
@staticmethod
def _render_xml(skills: list[ParsedSkill], *, compact: bool) -> str:
"""Render the `<available_skills>` block.
``compact=True`` drops `<description>` to preserve skill awareness
when the catalog would otherwise blow the char budget.
"""
lines = ["<available_skills>"]
for skill in sorted(all_skills, key=lambda s: s.name):
for skill in skills:
lines.append(" <skill>")
lines.append(f" <name>{escape(skill.name)}</name>")
lines.append(f" <description>{escape(skill.description)}</description>")
if not compact:
lines.append(f" <description>{escape(skill.description)}</description>")
lines.append(f" <location>{escape(skill.location)}</location>")
lines.append(f" <base_dir>{escape(skill.base_dir)}</base_dir>")
lines.append(" </skill>")
lines.append("</available_skills>")
xml_block = "\n".join(lines)
return f"{_BEHAVIORAL_INSTRUCTION}\n\n{xml_block}"
return "\n".join(lines)
def build_pre_activated_prompt(self, skill_names: list[str]) -> str:
"""Build prompt content for pre-activated skills.
+245 -13
View File
@@ -212,6 +212,211 @@ function ToolActivityRow({ content }: { content: string }) {
);
}
// --- Inline ask_user fallback ---------------------------------------------
// Sometimes the model prints the ask_user / ask_user_multiple payload as
// regular assistant text instead of invoking the tool. We detect that
// payload here and render a QuestionWidget / MultiQuestionWidget inline so
// the user still gets the nice button UI. Submissions are sent back as a
// regular user message via onSend (there is no pending backend state to
// fulfill, so we treat it like the user answering in chat).

// Normalized result of detection: either one question with options, or a
// list of (id, prompt, optional options) entries for the multi widget.
type AskUserInlinePayload =
  | { kind: "single"; question: string; options: string[] }
  | {
      kind: "multi";
      questions: { id: string; prompt: string; options?: string[] }[];
    };

// Parse assistant text into an AskUserInlinePayload, or return null when
// the text is not a recognizable ask_user / ask_user_multiple JSON object.
// Deliberately strict: any malformed entry rejects the whole payload so we
// never hijack ordinary prose that merely resembles JSON.
function detectAskUserPayload(content: string): AskUserInlinePayload | null {
  if (!content) return null;
  let text = content.trim();
  if (!text) return null;
  // Strip an optional ```json ... ``` / ``` ... ``` code fence
  const fence = text.match(/^```(?:json|JSON)?\s*([\s\S]*?)\s*```$/);
  if (fence) text = fence[1].trim();
  // Strip surrounding double quotes that fully wrap a JSON object
  if (text.length >= 2 && text.startsWith('"') && text.endsWith('"')) {
    const inner = text.slice(1, -1).trim();
    if (inner.startsWith("{") && inner.endsWith("}")) text = inner;
  }
  // Only a bare JSON object can be an ask_user payload.
  if (!text.startsWith("{") || !text.endsWith("}")) return null;
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return null;
  }
  if (!parsed || typeof parsed !== "object") return null;
  const obj = parsed as Record<string, unknown>;
  // ask_user_multiple: { questions: [{ id, prompt, options? }, ...] }
  if (Array.isArray(obj.questions)) {
    const raw = obj.questions as unknown[];
    // Sanity bounds: an empty list is meaningless, and more than 8
    // questions is almost certainly not a real ask_user_multiple call.
    if (raw.length < 1 || raw.length > 8) return null;
    const questions: { id: string; prompt: string; options?: string[] }[] = [];
    for (let i = 0; i < raw.length; i++) {
      const q = raw[i];
      if (!q || typeof q !== "object") return null;
      const qo = q as Record<string, unknown>;
      // Accept either `prompt` or `question` as the field name.
      const prompt =
        typeof qo.prompt === "string"
          ? qo.prompt
          : typeof qo.question === "string"
            ? qo.question
            : null;
      if (!prompt) return null;
      // Fall back to a positional id when none is supplied.
      const id = typeof qo.id === "string" && qo.id ? qo.id : `q${i}`;
      let options: string[] | undefined;
      if (
        Array.isArray(qo.options) &&
        qo.options.every((o) => typeof o === "string")
      ) {
        options = qo.options as string[];
      }
      questions.push({ id, prompt, options });
    }
    return { kind: "multi", questions };
  }
  // ask_user: { question: string, options: string[] }
  const question = typeof obj.question === "string" ? obj.question : null;
  const options =
    Array.isArray(obj.options) &&
    obj.options.every((o) => typeof o === "string")
      ? (obj.options as string[])
      : null;
  // Require at least two options so short JSON-ish replies aren't hijacked.
  if (!question || !options || options.length < 2) return null;
  return { kind: "single", question, options };
}
// Renders a misformatted ask_user payload as an interactive widget bubble.
// Props mirror MessageBubble's so this can be swapped in wherever a payload
// is detected; answers go back through onSend as a normal user message on
// the message's thread.
function InlineAskUserBubble({
  msg,
  payload,
  activeThread,
  onSend,
  queenPhase,
  showQueenPhaseBadge = true,
}: {
  msg: ChatMessage;
  payload: AskUserInlinePayload;
  activeThread: string;
  onSend: (
    message: string,
    thread: string,
    images?: ImageContent[],
  ) => void;
  queenPhase?: "planning" | "building" | "staging" | "running" | "independent";
  showQueenPhaseBadge?: boolean;
}) {
  const [state, setState] = useState<"pending" | "submitted" | "dismissed">(
    "pending",
  );
  // Once the user submits an answer via the inline widget, hide the whole
  // bubble — their reply appears right after as a normal user message.
  if (state === "submitted") return null;
  // If the user dismissed without answering, fall back to the regular
  // MarkdownContent rendering so they can still see what the model said.
  if (state === "dismissed") {
    return (
      <MessageBubble
        msg={msg}
        queenPhase={queenPhase}
        showQueenPhaseBadge={showQueenPhaseBadge}
      />
    );
  }
  const isQueen = msg.role === "queen";
  const color = getColor(msg.agent, msg.role);
  // Prefer the message's own thread; fall back to the active one.
  const thread = msg.thread || activeThread;
  const handleSingle = (answer: string) => {
    setState("submitted");
    onSend(answer, thread);
  };
  const handleMulti = (answers: Record<string, string>) => {
    setState("submitted");
    if (payload.kind !== "multi") return;
    // Format answers as a readable, numbered list for the outgoing message.
    const lines = payload.questions.map((q, i) => {
      const a = answers[q.id] ?? "";
      return `${i + 1}. ${q.prompt}\n ${a}`;
    });
    onSend(lines.join("\n"), thread);
  };
  return (
    <div className="flex gap-3">
      {/* Avatar: crown for the queen, CPU chip for workers. */}
      <div
        className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
        style={{
          backgroundColor: `${color}18`,
          border: `1.5px solid ${color}35`,
          boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
        }}
      >
        {isQueen ? (
          <Crown className="w-4 h-4" style={{ color }} />
        ) : (
          <Cpu className="w-3.5 h-3.5" style={{ color }} />
        )}
      </div>
      <div
        className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}
      >
        <div className="flex items-center gap-2 mb-1">
          <span
            className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
            style={{ color }}
          >
            {msg.agent}
          </span>
          {(!isQueen || showQueenPhaseBadge) && (
            <span
              className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
                isQueen
                  ? "bg-primary/15 text-primary"
                  : "bg-muted text-muted-foreground"
              }`}
            >
              {/* Phase badge: the message's own phase wins over the prop. */}
              {isQueen
                ? (msg.phase ?? queenPhase) === "independent"
                  ? "independent"
                  : (msg.phase ?? queenPhase) === "running"
                    ? "running"
                    : (msg.phase ?? queenPhase) === "staging"
                      ? "staging"
                      : (msg.phase ?? queenPhase) === "planning"
                        ? "planning"
                        : "building"
                : "Worker"}
            </span>
          )}
        </div>
        {payload.kind === "single" ? (
          <QuestionWidget
            inline
            question={payload.question}
            options={payload.options}
            onSubmit={handleSingle}
            onDismiss={() => setState("dismissed")}
          />
        ) : (
          <MultiQuestionWidget
            inline
            questions={payload.questions}
            onSubmit={handleMulti}
            onDismiss={() => setState("dismissed")}
          />
        )}
      </div>
    </div>
  );
}
const MessageBubble = memo(
function MessageBubble({
msg,
@@ -596,24 +801,51 @@ export default function ChatPanel({
onScroll={handleScroll}
className="flex-1 overflow-auto px-5 py-4 space-y-3"
>
{renderItems.map((item) =>
item.kind === "parallel" ? (
<div key={item.groupId}>
<ParallelSubagentBubble
groupId={item.groupId}
groups={item.groups}
/>
</div>
) : (
<div key={item.msg.id}>
{renderItems.map((item) => {
if (item.kind === "parallel") {
return (
<div key={item.groupId}>
<ParallelSubagentBubble
groupId={item.groupId}
groups={item.groups}
/>
</div>
);
}
const msg = item.msg;
// Detect misformatted ask_user payloads emitted as plain text and
// substitute the nicer widget-based bubble. Only inspect regular
// agent messages — skip system rows, tool status, dividers, etc.
const askPayload =
(msg.role === "queen" || msg.role === "worker") &&
!msg.type &&
msg.content
? detectAskUserPayload(msg.content)
: null;
if (askPayload) {
return (
<div key={msg.id}>
<InlineAskUserBubble
msg={msg}
payload={askPayload}
activeThread={activeThread}
onSend={onSend}
queenPhase={queenPhase}
showQueenPhaseBadge={showQueenPhaseBadge}
/>
</div>
);
}
return (
<div key={msg.id}>
<MessageBubble
msg={item.msg}
msg={msg}
queenPhase={queenPhase}
showQueenPhaseBadge={showQueenPhaseBadge}
/>
</div>
),
)}
);
})}
{/* Show typing indicator while waiting for first queen response (disabled + empty chat) */}
{(isWaiting || (disabled && threadMessages.length === 0)) && (
@@ -11,9 +11,15 @@ export interface MultiQuestionWidgetProps {
questions: QuestionItem[];
onSubmit: (answers: Record<string, string>) => void;
onDismiss?: () => void;
/**
* When true, skip the global Enter-to-submit listener. Use this when rendering
* the widget inline alongside other inputs (e.g. the chat textarea) so Enter
* isn't hijacked from the surrounding UI.
*/
inline?: boolean;
}
export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }: MultiQuestionWidgetProps) {
export default function MultiQuestionWidget({ questions, onSubmit, onDismiss, inline = false }: MultiQuestionWidgetProps) {
// Per-question state: selected index (null = nothing, options.length = "Other")
const [selections, setSelections] = useState<(number | null)[]>(
() => questions.map(() => null),
@@ -50,8 +56,10 @@ export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }:
onSubmit(answers);
}, [canSubmit, submitted, questions, selections, customTexts, onSubmit]);
// Enter to submit (only when not focused on a text input)
// Enter to submit (only when not focused on a text input).
// Skipped in inline mode so the widget doesn't hijack keys from surrounding inputs.
useEffect(() => {
if (inline) return;
const handleKeyDown = (e: KeyboardEvent) => {
if (submitted) return;
const target = e.target as HTMLElement;
@@ -63,7 +71,7 @@ export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }:
};
window.addEventListener("keydown", handleKeyDown);
return () => window.removeEventListener("keydown", handleKeyDown);
}, [handleSubmit, submitted]);
}, [handleSubmit, submitted, inline]);
if (submitted) return null;
@@ -10,9 +10,15 @@ export interface QuestionWidgetProps {
onSubmit: (answer: string, isOther: boolean) => void;
/** Called when user dismisses the question without answering */
onDismiss?: () => void;
/**
* When true, the widget does not register a global keyboard listener. Set this
* when rendering the widget inline alongside other inputs (e.g. a chat textarea)
* so Enter / number keys do not get hijacked from the surrounding UI.
*/
inline?: boolean;
}
export default function QuestionWidget({ question, options, onSubmit, onDismiss }: QuestionWidgetProps) {
export default function QuestionWidget({ question, options, onSubmit, onDismiss, inline = false }: QuestionWidgetProps) {
const [selected, setSelected] = useState<number | null>(null);
const [customText, setCustomText] = useState("");
const [submitted, setSubmitted] = useState(false);
@@ -42,8 +48,10 @@ export default function QuestionWidget({ question, options, onSubmit, onDismiss
}
}, [canSubmit, submitted, isOtherSelected, customText, options, selected, onSubmit]);
// Keyboard: Enter to submit, number keys to select (only when text input is not focused)
// Keyboard: Enter to submit, number keys to select (only when text input is not focused).
// Skipped in inline mode so the widget doesn't hijack keys from surrounding inputs.
useEffect(() => {
if (inline) return;
const handleKeyDown = (e: KeyboardEvent) => {
if (submitted) return;
const inTextInput = e.target === inputRef.current;
@@ -66,7 +74,7 @@ export default function QuestionWidget({ question, options, onSubmit, onDismiss
window.addEventListener("keydown", handleKeyDown);
return () => window.removeEventListener("keydown", handleKeyDown);
}, [handleSubmit, submitted, options.length]);
}, [handleSubmit, submitted, options.length, inline]);
if (submitted) return null;
+77 -21
View File
@@ -238,6 +238,12 @@ export default function ColonyChat() {
agentStateRef.current = agentState;
const turnCounterRef = useRef<Record<string, number>>({});
// Maps tool_use_id → the pill message ID and tool name that was created for it.
// Survives turn counter resets so deferred completions (e.g. ask_user) can
// find and update the correct pill even after the counter changes.
const toolUseToPillRef = useRef<
Record<string, { msgId: string; name: string }>
>({});
const queenPhaseRef = useRef<string>("planning");
const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
const suppressIntroRef = useRef(false);
@@ -468,6 +474,7 @@ export default function ColonyChat() {
setGraphNodes([]);
setAgentState(defaultAgentState());
turnCounterRef.current = {};
toolUseToPillRef.current = {};
queenPhaseRef.current = "planning";
queenIterTextRef.current = {};
suppressIntroRef.current = false;
@@ -782,6 +789,12 @@ export default function ColonyChat() {
const toolUseId = (event.data?.tool_use_id as string) || "";
const sid = event.stream_id;
// Track which pill message this tool belongs to so deferred
// completions (ask_user) can find it after the turn counter changes.
toolUseToPillRef.current[toolUseId] = {
msgId: `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`,
name: toolName,
};
setAgentState((prev) => {
const newActive = {
...prev.activeToolCalls,
@@ -826,30 +839,73 @@ export default function ColonyChat() {
appendNodeLog(event.node_id, `${ts} INFO ${toolName} done${resultStr}`);
}
// Look up the original pill message this tool belongs to.
// For deferred completions (ask_user), the turn counter and
// activeToolCalls have already been reset, so we rely on the
// ref recorded during tool_call_started.
const tracked = toolUseToPillRef.current[toolUseId];
delete toolUseToPillRef.current[toolUseId];
const sid = event.stream_id;
// Mark done in activeToolCalls if still present (normal case)
setAgentState((prev) => {
const updated = { ...prev.activeToolCalls };
if (updated[toolUseId]) {
updated[toolUseId] = { ...updated[toolUseId], done: true };
if (!prev.activeToolCalls[toolUseId]) return prev;
return {
...prev,
activeToolCalls: {
...prev.activeToolCalls,
[toolUseId]: {
...prev.activeToolCalls[toolUseId],
done: true,
},
},
};
});
// Determine the correct pill message ID
const pillMsgId =
tracked?.msgId ??
`tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`;
const trackedName = tracked?.name;
// Update the pill message content directly
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === pillMsgId);
if (idx < 0) return prevMsgs;
try {
const parsed = JSON.parse(prevMsgs[idx].content);
const tools: { name: string; done: boolean }[] =
parsed.tools || [];
if (trackedName) {
let marked = false;
for (let i = 0; i < tools.length; i++) {
if (
tools[i].name === trackedName &&
!tools[i].done &&
!marked
) {
tools[i] = { ...tools[i], done: true };
marked = true;
}
}
}
const allDone =
tools.length > 0 && tools.every((t) => t.done);
return prevMsgs.map((m, i) =>
i === idx
? {
...m,
content: JSON.stringify({ tools, allDone }),
}
: m,
);
} catch {
return prevMsgs;
}
const tools = Object.values(updated)
.filter((t) => t.streamId === sid)
.map((t) => ({ name: t.name, done: t.done }));
const allDone = tools.length > 0 && tools.every((t) => t.done);
upsertMessage({
id: `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`,
agent: agentDisplayName || event.node_id || "Agent",
agentColor: "",
content: JSON.stringify({ tools, allDone }),
timestamp: "",
type: "tool_status",
role,
thread: agentPath,
createdAt: eventCreatedAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
});
return { ...prev, activeToolCalls: updated };
});
}
break;
+72 -48
View File
@@ -58,6 +58,12 @@ export default function QueenDM() {
const [cloneTask, setCloneTask] = useState("");
const turnCounterRef = useRef(0);
// Maps tool_use_id → the pill message ID and tool name that was created for it.
// Survives turn counter resets so deferred completions (e.g. ask_user) can
// find and update the correct pill even after llm_turn_complete bumps the counter.
const toolUseToPillRef = useRef<
Record<string, { msgId: string; name: string }>
>({});
const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
const [queenPhase, setQueenPhase] = useState<
"planning" | "building" | "staging" | "running" | "independent"
@@ -77,6 +83,7 @@ export default function QueenDM() {
setQueenPhase("independent");
setInitialDraft(null);
turnCounterRef.current = 0;
toolUseToPillRef.current = {};
queenIterTextRef.current = {};
}, []);
@@ -390,6 +397,7 @@ export default function QueenDM() {
setIsTyping(true);
setQueenReady(true);
setActiveToolCalls({});
toolUseToPillRef.current = {};
// Clear queued flag on all user messages now that the queen is processing
setMessages((prev) => {
if (!prev.some((m) => m.queued)) return prev;
@@ -560,6 +568,11 @@ export default function QueenDM() {
? new Date(event.timestamp).getTime()
: Date.now();
// Track which pill message this tool belongs to so deferred
// completions (ask_user) can find it after the turn counter changes.
const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
toolUseToPillRef.current[toolUseId] = { msgId, name: toolName };
setActiveToolCalls((prev) => {
const newActive = {
...prev,
@@ -570,7 +583,6 @@ export default function QueenDM() {
done: t.done,
}));
const allDone = tools.length > 0 && tools.every((t) => t.done);
const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
const toolMsg: ChatMessage = {
id: msgId,
agent: queenName,
@@ -611,57 +623,68 @@ export default function QueenDM() {
case "tool_call_completed": {
const toolUseId = (event.data?.tool_use_id as string) || "";
// Look up the original pill message this tool belongs to.
// For deferred completions (ask_user), the turn counter and
// activeToolCalls have already been reset by llm_turn_complete,
// so we rely on the ref recorded during tool_call_started.
const tracked = toolUseToPillRef.current[toolUseId];
delete toolUseToPillRef.current[toolUseId];
// Mark done in activeToolCalls if still present (normal case)
setActiveToolCalls((prev) => {
if (!prev[toolUseId]) return prev;
return {
...prev,
[toolUseId]: { ...prev[toolUseId], done: true },
};
});
// Determine the correct pill message ID
const sid = event.stream_id;
const execId = event.execution_id || "exec";
const eventCreatedAt = event.timestamp
? new Date(event.timestamp).getTime()
: Date.now();
const pillMsgId =
tracked?.msgId ??
`tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
const toolName = tracked?.name;
setActiveToolCalls((prev) => {
const updated = { ...prev };
if (updated[toolUseId]) {
updated[toolUseId] = { ...updated[toolUseId], done: true };
// Update the pill message content directly
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === pillMsgId);
if (idx < 0) return prevMsgs;
try {
const parsed = JSON.parse(prevMsgs[idx].content);
const tools: { name: string; done: boolean }[] =
parsed.tools || [];
if (toolName) {
let marked = false;
for (let i = 0; i < tools.length; i++) {
if (
tools[i].name === toolName &&
!tools[i].done &&
!marked
) {
tools[i] = { ...tools[i], done: true };
marked = true;
}
}
}
const allDone =
tools.length > 0 && tools.every((t) => t.done);
return prevMsgs.map((m, i) =>
i === idx
? {
...m,
content: JSON.stringify({ tools, allDone }),
}
: m,
);
} catch {
return prevMsgs;
}
const tools = Object.entries(updated).map(([, t]) => ({
name: t.name,
done: t.done,
}));
const allDone = tools.length > 0 && tools.every((t) => t.done);
const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
const toolMsg: ChatMessage = {
id: msgId,
agent: queenName,
agentColor: "",
content: JSON.stringify({ tools, allDone }),
timestamp: "",
type: "tool_status",
role: "queen",
thread: "queen-dm",
createdAt: eventCreatedAt,
nodeId: event.node_id || undefined,
executionId: event.execution_id || undefined,
};
setMessages((prevMsgs) => {
const idx = prevMsgs.findIndex((m) => m.id === msgId);
if (idx >= 0) {
return prevMsgs.map((m, i) =>
i === idx ? { ...toolMsg, createdAt: m.createdAt ?? toolMsg.createdAt } : m,
);
}
// Insert in sorted position by createdAt
const ts = toolMsg.createdAt ?? Date.now();
let insertIdx = prevMsgs.length - 1;
while (insertIdx >= 0 && (prevMsgs[insertIdx].createdAt ?? 0) > ts) {
insertIdx--;
}
if (insertIdx === -1 || insertIdx === prevMsgs.length - 1) {
return [...prevMsgs, toolMsg];
}
const next = [...prevMsgs];
next.splice(insertIdx + 1, 0, toolMsg);
return next;
});
return updated;
});
break;
}
@@ -746,6 +769,7 @@ export default function QueenDM() {
setIsTyping(false);
setIsStreaming(false);
setActiveToolCalls({});
toolUseToPillRef.current = {};
// Clear queued flags since the queen is now idle
setMessages((prev) => {
if (!prev.some((m) => m.queued)) return prev;
+55 -1
View File
@@ -4,7 +4,8 @@ from __future__ import annotations
import pytest
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool
class TestSupportsImageToolResults:
@@ -56,3 +57,56 @@ class TestSupportsImageToolResults:
assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
assert supports_image_tool_results("OLLAMA/llama3") is False
assert supports_image_tool_results("GPT-4o") is True
class TestFilterToolsForModel:
    """Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""

    def test_hides_image_tool_from_text_only_model(self):
        catalog = [
            Tool(name="read_file", description="read a file"),
            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
            Tool(name="browser_snapshot", description="get page content"),
        ]
        kept, hidden = filter_tools_for_model(catalog, "glm-5")
        kept_names = {tool.name for tool in kept}
        assert "browser_screenshot" not in kept_names
        assert "read_file" in kept_names
        assert "browser_snapshot" in kept_names
        assert hidden == ["browser_screenshot"]

    def test_keeps_image_tool_for_vision_model(self):
        catalog = [
            Tool(name="read_file", description="read a file"),
            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
        ]
        kept, hidden = filter_tools_for_model(catalog, "claude-sonnet-4-20250514")
        assert {tool.name for tool in kept} == {"read_file", "browser_screenshot"}
        assert hidden == []

    def test_default_tools_are_not_filtered(self):
        """Tools without produces_image (default False) are kept for all models."""
        plain_tools = [
            Tool(name="read_file", description="read a file"),
            Tool(name="web_search", description="search the web"),
        ]
        # Same outcome for a text-only model and a vision model.
        for model in ("glm-5", "gpt-4o"):
            kept, hidden = filter_tools_for_model(plain_tools, model)
            assert len(kept) == 2
            assert hidden == []

    def test_empty_model_string_returns_tools_unchanged(self):
        """Guards the ctx.llm-missing path where model is empty."""
        catalog = [
            Tool(name="browser_screenshot", description="", produces_image=True),
        ]
        kept, hidden = filter_tools_for_model(catalog, "")
        assert len(kept) == 1
        assert hidden == []

    def test_returned_list_is_a_copy(self):
        """Caller should be free to mutate the filtered list without affecting input."""
        catalog = [Tool(name="read_file", description="")]
        kept, _ = filter_tools_for_model(catalog, "gpt-4o")
        kept.append(Tool(name="extra", description=""))
        assert len(catalog) == 1
+65
View File
@@ -0,0 +1,65 @@
"""Tests for vision-only prompt block stripping in Queen nodes.
Covers ``finalize_queen_prompt``, the function that resolves
``<!-- vision-only -->...<!-- /vision-only -->`` markers in Queen phase
prompts before they reach the LLM. Vision-capable models see the inner
content; text-only models see the block removed entirely.
"""
from __future__ import annotations
from framework.agents.queen.nodes import finalize_queen_prompt
class TestFinalizeQueenPrompt:
    def test_vision_model_keeps_inner_content_and_strips_markers(self):
        prompt = "before <!-- vision-only -->secret<!-- /vision-only --> after"
        assert finalize_queen_prompt(prompt, has_vision=True) == "before secret after"

    def test_text_only_model_removes_entire_block(self):
        prompt = "before <!-- vision-only -->secret<!-- /vision-only --> after"
        stripped = finalize_queen_prompt(prompt, has_vision=False)
        assert stripped == "before after"
        assert "secret" not in stripped
        assert "vision-only" not in stripped

    def test_multiline_block_handled(self):
        """Regex must use DOTALL so blocks can span newlines."""
        prompt = (
            "- item 1\n"
            "<!-- vision-only -->\n"
            "- item 2 (vision only)\n"
            "<!-- /vision-only -->\n"
            "- item 3\n"
        )
        with_vision = finalize_queen_prompt(prompt, has_vision=True)
        without_vision = finalize_queen_prompt(prompt, has_vision=False)
        assert "- item 2 (vision only)" in with_vision
        assert "- item 2 (vision only)" not in without_vision
        assert "- item 1" in without_vision and "- item 3" in without_vision

    def test_multiple_blocks_in_same_text(self):
        prompt = (
            "A <!-- vision-only -->X<!-- /vision-only --> "
            "B <!-- vision-only -->Y<!-- /vision-only --> C"
        )
        assert finalize_queen_prompt(prompt, has_vision=True) == "A X B Y C"
        assert finalize_queen_prompt(prompt, has_vision=False) == "A B C"

    def test_non_greedy_match_does_not_swallow_between_blocks(self):
        """A naïve greedy regex would match from the first opening marker
        to the last closing marker and wipe out the middle section. Lock
        that down so a future refactor can't regress to greedy."""
        prompt = (
            "<!-- vision-only -->first<!-- /vision-only -->"
            "KEEP"
            "<!-- vision-only -->second<!-- /vision-only -->"
        )
        assert finalize_queen_prompt(prompt, has_vision=False) == "KEEP"
        assert finalize_queen_prompt(prompt, has_vision=True) == "firstKEEPsecond"

    def test_text_without_markers_is_unchanged(self):
        prompt = "plain prompt with no markers at all"
        assert finalize_queen_prompt(prompt, has_vision=True) == prompt
        assert finalize_queen_prompt(prompt, has_vision=False) == prompt
+37 -3
View File
@@ -94,7 +94,10 @@ class TestSkillCatalog:
assert "<name>beta</name>" in prompt
assert "<description>Alpha skill</description>" in prompt
assert "<location>/p/alpha/SKILL.md</location>" in prompt
assert "<base_dir>/p/alpha</base_dir>" in prompt
# <base_dir> is intentionally not emitted — the mandatory header
# tells the model to resolve relative paths against the parent of
# SKILL.md, so the redundant element was dropped.
assert "<base_dir>" not in prompt
def test_to_prompt_sorted_by_name(self):
skills = [
@@ -130,13 +133,44 @@ class TestSkillCatalog:
assert "<name>usr</name>" in prompt
assert "<name>fw</name>" in prompt
def test_to_prompt_contains_behavioral_instruction(self):
def test_to_prompt_contains_mandatory_header(self):
"""The rendered catalog must carry the mandatory pre-reply checklist
so soft guidance turns into a required step."""
catalog = SkillCatalog([_make_skill(source_scope="project")])
prompt = catalog.to_prompt()
assert "When a task matches a skill's description" in prompt
assert "## Skills (mandatory)" in prompt
assert "Before replying: scan <available_skills>" in prompt
assert "never read more than one skill up front" in prompt
assert "`read_file`" in prompt
assert "SKILL.md" in prompt
def test_to_prompt_compact_fallback_drops_descriptions(self):
"""When the full XML body exceeds the char threshold, the compact
variant drops <description> but keeps every skill's <name>."""
# Each skill contributes ~100+ chars with a long description.
# 60 skills easily pushes the body past the threshold.
skills = [
_make_skill(
name=f"skill-{i:03d}",
description="A reasonably long description " * 4,
location=f"/s/skill-{i:03d}/SKILL.md",
base_dir=f"/s/skill-{i:03d}",
)
for i in range(60)
]
catalog = SkillCatalog(skills)
prompt = catalog.to_prompt()
# Mandatory header still present but uses the compact variant wording.
assert "## Skills (mandatory)" in prompt
assert "scan <available_skills> <name>" in prompt
# Every skill's name survives …
for i in range(60):
assert f"<name>skill-{i:03d}</name>" in prompt
# … but no descriptions were rendered.
assert "<description>" not in prompt
def test_build_pre_activated_prompt(self):
skill = _make_skill("research", body="## Deep Research\nDo thorough research.")
catalog = SkillCatalog([skill])
+6 -26
View File
@@ -1,9 +1,14 @@
"""Tests for AS-6 skill resource loading support.
Covers:
- <base_dir> element in catalog XML
- allowlisted_dirs property reflects trusted skill base directories
- skill_dirs propagation to NodeContext
The catalog XML previously emitted a redundant <base_dir> element next to
each <location>. That was dropped when the mandatory header took over the
"resolve relative paths against the parent of SKILL.md" instruction, so
there is no longer an XML-emission test for base_dir. Programmatic access
via ``catalog.allowlisted_dirs`` is still covered below.
"""
from framework.skills.catalog import SkillCatalog
@@ -26,31 +31,6 @@ def _make_skill(
class TestSkillResourceBaseDir:
def test_base_dir_in_xml(self):
"""Each community skill entry should expose its base_dir in the catalog XML."""
skill = _make_skill("deploy", "/project/.hive/skills/deploy")
catalog = SkillCatalog([skill])
prompt = catalog.to_prompt()
assert "<base_dir>/project/.hive/skills/deploy</base_dir>" in prompt
def test_base_dir_xml_escaped(self):
"""base_dir with XML-special chars should be escaped."""
skill = _make_skill("s", "/path/with <&> chars")
catalog = SkillCatalog([skill])
prompt = catalog.to_prompt()
assert "<base_dir>/path/with &lt;&amp;&gt; chars</base_dir>" in prompt
def test_base_dir_present_for_framework_skills(self):
"""Framework-scope skills now appear in the catalog like any other scope,
and their base_dir is included in the XML."""
skill = _make_skill("fw", "/hive/_default_skills/fw", source_scope="framework")
catalog = SkillCatalog([skill])
prompt = catalog.to_prompt()
assert "<name>fw</name>" in prompt
assert "<base_dir>/hive/_default_skills/fw</base_dir>" in prompt
def test_allowlisted_dirs_matches_skills(self):
"""allowlisted_dirs returns all skill base_dirs including framework ones."""
skills = [
+49
View File
@@ -799,6 +799,55 @@ def test_resync_returns_false_when_credentials_unchanged(tmp_path, monkeypatch):
assert registry.resync_mcp_servers_if_needed() is False
class TestMcpToolProducesImageFlag:
    """Verify _convert_mcp_tool_to_framework_tool sets produces_image from the name.

    This is the detection step that the filter in AgentLoop depends on:
    if the regex regresses, text-only models will start seeing screenshot
    tools they can't use.
    """

    @staticmethod
    def _mcp_tool(name: str):
        # Minimal stand-in for an MCP tool record — only the attributes the
        # converter reads.
        return SimpleNamespace(
            name=name,
            description=f"{name} description",
            input_schema={"type": "object", "properties": {}, "required": []},
            server_name="test",
        )

    def test_screenshot_flagged(self):
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("browser_screenshot")
        )
        assert converted.produces_image is True

    def test_snapshot_not_flagged(self):
        """browser_snapshot returns a DOM tree, not an image — must not match."""
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("browser_snapshot")
        )
        assert converted.produces_image is False

    def test_case_insensitive_match(self):
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("TakeScreenshot")
        )
        assert converted.produces_image is True

    def test_plain_tool_not_flagged(self):
        converted = ToolRegistry()._convert_mcp_tool_to_framework_tool(  # noqa: SLF001
            self._mcp_tool("read_file")
        )
        assert converted.produces_image is False

    def test_image_suffix_variants_flagged(self):
        registry = ToolRegistry()
        for name in ("capture_image", "render_image", "get_image", "snapshot_image"):
            converted = registry._convert_mcp_tool_to_framework_tool(self._mcp_tool(name))  # noqa: SLF001
            assert converted.produces_image is True, f"{name} should be flagged"
# ---------------------------------------------------------------------------
# Concurrency-safe flag propagation
# ---------------------------------------------------------------------------
+176
View File
@@ -0,0 +1,176 @@
# 🐝 Hive Agent v0.10.0: The Colony
> ⚠️ **Breaking change.** This is a large architectural refactor of how agents work in Hive. **Old agents are no longer compatible.** Existing workspaces, custom agents, and saved sessions from pre-v0.10.0 builds will need to be recreated.
---
## ✨ Highlights
The **Colony** introduces a new way of working: a group of specialized workers operating together to run and scale your business.
The role of the **Queen** has evolved. Instead of only orchestrating, the Queen now **executes work first** to deliver immediate value, then **builds systems around that work** to create stable, repeatable business processes.
You now have a full leadership team of eight Queens, each with their own identity, expertise, and voice:
| Queen | Role |
| --- | --- |
| **Sophia** | Head of Brand & Design |
| **Charlotte** | Head of Finance & Fundraising |
| **Victoria** | Head of Growth |
| **Eleanor** | Head of Legal |
| **Rachel** | Head of Operations |
| **Isabella** | Head of Product Strategy |
| **Amelia** | Head of Talent |
| **Alexandra** | Head of Technology |
Start automating your business processes with your Queens today.
---
## 🏛️ The Colony Architecture
### Queens as Identities, Not Just Orchestrators
- **Queen profiles** — each queen is a YAML-backed persona (`~/.hive/agents/queens/{queen_id}/profile.yaml`) with core traits, hidden background, psychological profile, behavior triggers, and skill sets. Profiles are injected into the system prompt at session start.
- **CEO-style queen selection** — an LLM classifier routes every new user request to the best-matching queen based on the task at hand, with structured routing diagnostics (`QueenSelection`).
- **Queen DMs** — direct-message pages for each queen with a dedicated session flow, session switcher, and prompt library integration.
- **Independent / PM mode** — queens run in an independent mode for planning-phase work, with a "think out loud" internal monologue surfaced through internal tags.
- **Queen memory v2** — simplified memory implementation with reflection agent, cooldown-gated reflections, user identity, doppelganger wiring, and recall-selector for targeted retrieval.
- **Queen lifecycle tools** — first-class tools for escalation, queen reply, and session handoff.
### Colony Runtime
- **Grand architecture revamp** — the framework, agent loop, runtime, graph, pipeline, executor, and node worker layers have been rewritten from the ground up. Deprecated shims and legacy orchestration paths have been removed.
- **Colony creation flow** — colonies are created via skill, with reliable event bus subscription, worker spawning, and post-creation list refresh.
- **Scheduled triggers** — colonies can now be woken on a cron schedule, with triggers firing directly into the owning queen's session.
- **Simple fork** for agents, stable credential states, and improved worker execution reliability.
---
## 🆕 What's New
### Colony & Queens
- 8 default queen personas (Alexandra, Victoria, Isabella, Charlotte, Eleanor, Sophia, Amelia, Rachel) with profile YAML, examples, and behavior triggers
- LLM-based queen selector with reasoning output
- Queen DM page, queen session switcher, and sidebar queen item
- Queen scope memory, role examples, and identity loading
- Reflection agent with cooldown and improved reflection runner
- Queen orchestrator + `routes_queens` API
- Natural chat replies and cleaner home-prompt bootstrap
- Queen identity for new sessions
- `ask_user` / `ask_user_multiple` tools available in queen prompt
- Escalation and queen-reply tools
### Skills & Tools
- **Learned default skills** — skills the queen has learned become part of her baseline
- **Tool-gated skill activation** — skills only activate when their required tools are present
- **Skills for colonies** — per-colony skill registration and loading
- **Text-only model filter** — image-producing tools and vision-only prompt blocks are hidden from text-only models
- **Browser skills upgrade** — improved click reliability, screenshot capture, and credential filtering
- **Deprecated-tool removal** and alignment of Hive tool names across the codebase
- **Ask-user widget** with fallback rendering and preserved tool pill mapping across turn boundaries for deferred completions
- **Improved tool-call reliability** across the board (tool limit removed, tool blacklist, tool credential filter)
- **MCP** — efficient MCP loading at initialization, default MCP bootstrapping, registered available MCP tools, fixed MCP tool initialization and registry pipeline stage
### LLM & Credentials
- **Key pool** for credential management with stable credential states
- **Aden credentials storage adapter** and subscription-based LLM config activation endpoint
- **Consolidated model config** with unified model catalog
- **New providers** — Kimi, Hive, and Aden added to the model catalog
- **Model switcher** UI with runtime model switching API
- **LLM key validation endpoint** with agent errors surfaced via SSE
- **BYOK modal** import fixes for subscription token detection
### Frontend
- **Home redesign** — new home, credentials, and org chart pages
- **Colony chat** and **queen DM** pages
- **Sidebar + header** components and global app layout/routing
- **Model switcher, settings modal, template card**
- **Prompt library** with search, category filtering, and UI polish
- **Side panel** fixes and sub-agent pane light-mode support
- **Flowchart** light-mode support and normalized settings modal sizing
- **User profile settings** and UI enhancements
- **Sync user profile** to global memory as `user-profile.md`; queen profile API transformation
- Removed the old workspace GUI and its dependencies
### Framework & Runtime
- Architecture revamp: new runtime config, simplified agent loading, new infra for queen
- Home hive directory structure refactor
- Agent loading pipeline fixes, MCP registry pipeline stage fix
- Session resume improvements: separate resume vs new-session flow for queen sessions, edge-case fix for message injection in resumed sessions
- Strip internal tags from user-visible output
- Colony event bus subscription fixes and shared event bus for parent visibility
- Worker spawn and stop-worker fixes
- Default log level and extra logging hooks
---
## 🐛 Bug Fixes
- **Ask-user widget** — fallback when widget fails to mount
- **Skill loading** for colonies and proper skill resolution across queen sessions
- **Model switching** and new-chat flow no longer carry stale state
- **Tool pill mapping** preserved across turn boundary for deferred `ask_user` completions
- **Tool limit** removed (was capping legitimate long tool lists)
- **Queen loading** stability fixes
- **Side panel** rendering issues
- **Deprecated graphs** removed from UI
- **Home-page prompts** now reach the queen directly without waiting for the greeting to finish
- **Colony creation** link, reframing, and post-creation refresh
- **Build error** in colony creation path
- **GCU system prompt** tuning
- **Tool credential filter** correctness
- **Screenshot** capture and browser click reliability
- **Queen message injection** when resuming a session
- **Internal-tag diction** fixes in surfaced output
- **MCP tool initialization** on cold start
- **Frontend DM** edge cases
- **Prompt library** new-session handling for new chat
- **Config validation** and unavailable Minimax model handling
- **Queen identity** loading on cold boot
- **Extra text** in queen selector JSON response parsed safely
- **Outdated queen communication prompt** removed
---
## 🧹 Refactor & Cleanup
- **Shatter the Eld\*n ring** — top-to-bottom refactor of the runtime core
- **Grand clean-up** of deprecated code paths
- **Remove deprecated shims** and old session-status tools
- **Big test cleanup** — integration tests and component tests rewritten around the new architecture
- **Update references** for orchestrator / host / loader renames
- **Consolidate tests** for queen state machine and verified outcomes
- **Remove old workspace GUI** and its dependencies
- **Remove old "new agent" button** and deprecated entry points
- **Home hive directory** structure refactor
---
## ⚠️ Breaking Changes
- **Old agents are not compatible.** Custom agents authored against the pre-v0.10.0 framework will need to be re-authored against the new Queen/Colony runtime.
- **Session format** — pre-v0.10.0 sessions cannot be resumed.
- **Deprecated tools removed** and Hive tool names have been realigned; any external scripts referencing old tool names must be updated.
- **Old session-status tools** removed in favor of the new queen lifecycle tools.
- **Workspace GUI removed** — the legacy workspace UI is gone; use the new home, colony chat, and queen DM pages.
- **MCP registry pipeline** — MCP configurations now load through the new registry; custom MCP setups may need to be re-registered.
---
## 🚀 Upgrading
Because this release rewrites the agent runtime, the recommended upgrade path is:
1. Back up `~/.hive/` if you have sessions or custom agents you want to reference.
2. Pull `main` at the v0.10.0 tag.
3. Let Hive initialize the new queen profiles under `~/.hive/agents/queens/`.
4. Re-create any custom agents as colonies/queens against the new framework.
5. Re-register any custom MCP servers through the new MCP registry.
Welcome to the Colony. 🐝
@@ -255,9 +255,10 @@ def register_tools(mcp: FastMCP) -> None:
# Clean up whitespace
text = " ".join(text.split())
# Truncate if needed
# Truncate if needed (reserve 3 chars for the ellipsis so the
# final string stays within max_length)
if len(text) > max_length:
text = text[:max_length] + "..."
text = text[: max_length - 3] + "..."
result: dict[str, Any] = {
"url": url,
+18
View File
@@ -113,6 +113,24 @@ class TestWebScrapeTool:
assert isinstance(result, dict)
assert "error" not in result
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_truncation_respects_max_length(self, mock_pw, mock_stealth, web_scrape_fn):
"""Truncated content (including the ellipsis) must not exceed max_length."""
# max_length is clamped to >=1000, so build content larger than that
long_text = "a" * 5000
html = f"<html><body>{long_text}</body></html>"
mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
mock_pw.return_value = mock_cm
mock_stealth.return_value.apply_stealth_async = AsyncMock()
result = await web_scrape_fn(url="https://example.com", max_length=1000)
assert "error" not in result
assert len(result["content"]) <= 1000
assert result["content"].endswith("...")
assert result["length"] == len(result["content"])
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)