fix: context health and eviction

Merge remote-tracking branch 'origin/main'
Merge branch 'feature/new-colony'
2026-04-15 11:40:45 -07:00 · 2026-04-14 19:56:33 -07:00 · 2026-04-14 19:56:08 -07:00 · 2026-04-14 18:51:14 -07:00 · 2026-04-14 18:02:49 -07:00 · 2026-04-14 16:34:30 -07:00
40 changed files with 2331 additions and 387 deletions
@@ -39,7 +39,12 @@
      "Bash(bun run:*)",
      "Bash(npx eslint:*)",
      "Bash(npm run:*)",
-      "Bash(npm test:*)"
+      "Bash(npm test:*)",
+      "Bash(grep -n \"PIL\\\\|Image\\\\|to_thread\\\\|run_in_executor\" /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
+      "WebFetch(domain:docs.litellm.ai)",
+      "Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
+      "Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
+      "Bash(grep -v ':0$')"
    ],
    "additionalDirectories": [
      "/home/timothy/.hive/skills/writing-hive-skills",
@@ -87,7 +87,7 @@ from framework.agent_loop.internals.types import (
 )
 from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
 from framework.host.event_bus import EventBus
-from framework.llm.capabilities import supports_image_tool_results
+from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
 from framework.llm.provider import Tool, ToolResult, ToolUse
 from framework.llm.stream_events import (
    FinishEvent,
@@ -632,13 +632,20 @@ class AgentLoop(AgentProtocol):
        if isinstance(stream_id, str) and stream_id.startswith("worker:"):
            tools.append(build_report_to_parent_tool())

+        # Hide image-producing tools from text-only models so they never try
+        # to call them. Avoids wasted turns + "screenshot failed" lessons
+        # getting saved to memory. See framework.llm.capabilities.
+        _llm_model = ctx.llm.model if ctx.llm else ""
+        tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
+
        logger.info(
-            "[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s",
+            "[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
            node_id,
            len(tools),
            [t.name for t in tools],
            ctx.supports_direct_user_io,
            type(self._judge).__name__ if self._judge else "None",
+            _hidden_image_tools,
        )

        # 4. Publish loop started
@@ -2722,9 +2729,18 @@ class AgentLoop(AgentProtocol):
            real_tool_results: list[dict] = []
            limit_hit = False
            executed_in_batch = 0
-            hard_limit = int(
-                self._config.max_tool_calls_per_turn * (1 + self._config.tool_call_overflow_margin)
-            )
+            # hard_limit <= 0 disables the per-turn cap entirely. Some
+            # models routinely emit 50+ tool calls per turn during wide
+            # fan-out scenarios (browser exploration, bulk code reads);
+            # capping them strands work mid-turn and the next turn just
+            # re-emits the discarded calls, which is strictly worse.
+            if self._config.max_tool_calls_per_turn > 0:
+                hard_limit = int(
+                    self._config.max_tool_calls_per_turn
+                    * (1 + self._config.tool_call_overflow_margin)
+                )
+            else:
+                hard_limit = 0  # disabled

            # Phase 1: triage — handle framework tools immediately,
            # queue real tools for parallel execution.
@@ -2736,7 +2752,7 @@ class AgentLoop(AgentProtocol):

            for tc in tool_calls:
                tool_call_count += 1
-                if tool_call_count > hard_limit:
+                if hard_limit > 0 and tool_call_count > hard_limit:
                    limit_hit = True
                    break
                executed_in_batch += 1
@@ -3052,6 +3068,33 @@ class AgentLoop(AgentProtocol):
                    str, tuple[ToolResult | BaseException, str, float] | BaseException
                ] = {}

+                async def _cancel_turn_with_stubs(
+                    _pending: list[ToolCallEvent] = pending_real,  # noqa: B006,B008
+                ) -> None:
+                    """Populate [Tool call cancelled by user] stubs for
+                    every pending tool so the conversation doesn't end
+                    up with dangling tool_use blocks, then raise
+                    TurnCancelled so the queen event loop continues
+                    cleanly. Shared between the parallel and serial
+                    phases because either can observe CancelledError.
+                    """
+                    for _tc in _pending:
+                        await conversation.add_tool_result(
+                            tool_use_id=_tc.tool_use_id,
+                            content="[Tool call cancelled by user]",
+                            is_error=True,
+                        )
+                        await self._publish_tool_completed(
+                            stream_id,
+                            node_id,
+                            _tc.tool_use_id,
+                            _tc.tool_name,
+                            "[Tool call cancelled by user]",
+                            is_error=True,
+                            execution_id=execution_id,
+                        )
+                    raise TurnCancelled() from None
+
                # Phase 2b: resolve the concurrency-safe batch. Prefer
                # any early task already started during streaming (Gap
                # 1) so we don't accidentally execute the same tool
@@ -3073,11 +3116,22 @@ class AgentLoop(AgentProtocol):
                    finally:
                        self._tool_task = None
                    # gather(return_exceptions=True) captures CancelledError
-                    # as a return value instead of propagating it.  Re-raise
-                    # so stop_worker actually stops the execution.
+                    # as a return value instead of propagating it.
+                    # Distinguish cancel_current_turn() (cancels only
+                    # _tool_task) from stop_worker (cancels the parent
+                    # execution task). When the parent itself is
+                    # cancelled, cancelling() > 0 — propagate so the
+                    # executor can save state. Otherwise convert to
+                    # TurnCancelled so the queen event loop continues,
+                    # writing cancellation stubs for every pending tool
+                    # first so the conversation has no dangling
+                    # tool_use blocks.
                    for entry in parallel_timed:
                        if isinstance(entry, asyncio.CancelledError):
-                            raise entry
+                            task = asyncio.current_task()
+                            if task and task.cancelling() > 0:
+                                raise entry
+                            await _cancel_turn_with_stubs()
                    for tc, entry in zip(parallel_batch, parallel_timed, strict=True):
                        timed_results_by_id[tc.tool_use_id] = entry

@@ -3087,6 +3141,8 @@ class AgentLoop(AgentProtocol):
                # drop. A ToolResult with is_error=True is a normal return
                # (e.g. "file not found") and does NOT trip the cascade -
                # the model should see subsequent errors too.
+                # CancelledError is handled separately via the shared
+                # user-cancel helper above.
                _serial_cascade_broken = False
                for tc in serial_batch:
                    if _serial_cascade_broken:
@@ -3113,12 +3169,13 @@ class AgentLoop(AgentProtocol):

                    timed_results_by_id[tc.tool_use_id] = entry
                    raw_check = entry[0] if isinstance(entry, tuple) else entry
-                    if isinstance(raw_check, BaseException) and not isinstance(
-                        raw_check, asyncio.CancelledError
-                    ):
+                    if isinstance(raw_check, asyncio.CancelledError):
+                        task = asyncio.current_task()
+                        if task and task.cancelling() > 0:
+                            raise raw_check
+                        await _cancel_turn_with_stubs()
+                    elif isinstance(raw_check, BaseException):
                        _serial_cascade_broken = True
-                    elif isinstance(raw_check, asyncio.CancelledError):
-                        raise raw_check

                # Phase 2d: reassemble results in original call order so
                # the rest of the loop sees no difference from the
@@ -3139,7 +3196,9 @@ class AgentLoop(AgentProtocol):
                        result = _build_tool_error_result(tc, raw)
                    else:
                        result = raw
-                    results_by_id[tc.tool_use_id] = self._truncate_tool_result(result, tc.tool_name)
+                    results_by_id[tc.tool_use_id] = await self._truncate_tool_result(
+                        result, tc.tool_name
+                    )

            # Phase 3: record results into conversation in original order,
            # build logged/real lists, and publish completed events.
@@ -3274,6 +3333,24 @@ class AgentLoop(AgentProtocol):
                    False,
                )

+            # --- Image eviction: strip old screenshot image_content ---
+            # Screenshots from browser_screenshot are inlined as base64
+            # data URLs in message.image_content. Each screenshot costs
+            # ~250k tokens when the provider counts base64 as text
+            # (gemini, most non-Anthropic providers). Four screenshots
+            # in one conversation blew through gemini's 1M context in
+            # session_20260415_104727_5c4ed7ff and caused garbage
+            # output ("协日" as the final assistant text). We evict
+            # aggressively after every tool batch — independent of the
+            # char-based usage_ratio, which severely underestimates
+            # image cost (counts each image as ~2000 tokens vs the
+            # ~250k actually billed). Text metadata stays on the
+            # evicted messages so the agent can still reason about
+            # "I took a screenshot at step N".
+            _max_imgs = self._config.max_retained_screenshots
+            if _max_imgs >= 0:
+                await conversation.evict_old_images(keep_latest=_max_imgs)
+
            # --- Mid-turn pruning: prevent context blowup within a single turn ---
            if conversation.usage_ratio() >= 0.6:
                protect = max(2000, self._config.max_context_tokens // 12)
@@ -3598,7 +3675,7 @@ class AgentLoop(AgentProtocol):
            max_chars=max_chars,
        )

-    def _truncate_tool_result(
+    async def _truncate_tool_result(
        self,
        result: ToolResult,
        tool_name: str,
@@ -3614,8 +3691,33 @@ class AgentLoop(AgentProtocol):
        - Large results (> limit): preview + file reference
        - Errors: pass through unchanged
        - read_file results: truncate with pagination hint (no re-spill)
+
+        For large results this does a synchronous JSON round-trip
+        (``json.loads`` + pretty-print ``json.dumps(indent=2)``) plus a
+        file write. On big payloads — web_search, web_fetch, full-page
+        extractions — this can block the event loop for hundreds of ms
+        per call. We offload to a worker thread so concurrent tool
+        executions keep running while one large result is being
+        pretty-printed and spilled to disk.
        """
-        return truncate_tool_result(
+        # Fast path: small results don't need thread offload. The
+        # function only touches disk / does heavy JSON work when the
+        # result exceeds either the truncation or spillover threshold,
+        # so cheap pass-throughs stay on the main loop.
+        needs_offload = (
+            len(result.content) > 10_000
+            and not result.is_error
+        )
+        if not needs_offload:
+            return truncate_tool_result(
+                result=result,
+                tool_name=tool_name,
+                max_tool_result_chars=self._config.max_tool_result_chars,
+                spillover_dir=self._config.spillover_dir,
+                next_spill_filename_fn=self._next_spill_filename,
+            )
+        return await asyncio.to_thread(
+            truncate_tool_result,
            result=result,
            tool_name=tool_name,
            max_tool_result_chars=self._config.max_tool_result_chars,
@@ -162,10 +162,17 @@ def update_run_cursor(
 def _extract_spillover_filename(content: str) -> str | None:
    """Extract spillover filename from a tool result annotation.

-    Matches patterns produced by EventLoopNode._truncate_tool_result():
-        - Large result:  "saved to 'web_search_1.txt'"
-        - Small result:  "[Saved to 'web_search_1.txt']"
+    Matches patterns produced by ``truncate_tool_result``:
+        - New large-result header: "Full result saved at: /abs/path/file.txt"
+        - Legacy bracketed trailer: "[Saved to 'file.txt']"  (pre-2026-04-15,
+          retained here so cold conversations still resolve)
    """
+    # New prose format — ``saved at: <absolute path>``, terminated by
+    # newline or end-of-string.
+    match = re.search(r"[Ss]aved at:\s*(\S+)", content)
+    if match:
+        return match.group(1)
+    # Legacy format.
    match = re.search(r"[Ss]aved to '([^']+)'", content)
    return match.group(1) if match else None

@@ -878,12 +885,14 @@ class NodeConversation:

            if spillover:
                placeholder = (
-                    f"[Pruned tool result: {orig_len} chars. "
-                    f"Full data in '{spillover}'. "
-                    f"Use read_file('{spillover}') to retrieve.]"
+                    f"Pruned tool result ({orig_len:,} chars) cleared from context. "
+                    f"Full data saved at: {spillover}\n"
+                    f"Read the complete data with read_file(path='{spillover}')."
                )
            else:
-                placeholder = f"[Pruned tool result: {orig_len} chars cleared from context.]"
+                placeholder = (
+                    f"Pruned tool result ({orig_len:,} chars) cleared from context."
+                )

            self._messages[i] = Message(
                seq=msg.seq,
@@ -905,6 +914,81 @@ class NodeConversation:
        self._last_api_input_tokens = None
        return count

+    async def evict_old_images(self, keep_latest: int = 2) -> int:
+        """Strip ``image_content`` from older messages, keeping the most recent.
+
+        Screenshots from ``browser_screenshot`` are inlined into the
+        message's ``image_content`` as base64 data URLs. Each screenshot
+        costs ~250k tokens when the provider counts the base64 as
+        text — four screenshots push a conversation over gemini's 1M
+        context limit and trigger out-of-context garbage output (see
+        ``session_20260415_104727_5c4ed7ff`` for the terminal case
+        where the model emitted ``协日`` as its final text then stopped).
+
+        This method walks backward through messages and keeps
+        ``image_content`` intact on the most recent ``keep_latest``
+        messages that have images. Older messages get their
+        ``image_content`` nulled out — the text content (metadata
+        like url, dimensions, scale hints) stays, but the raw bytes
+        are dropped. Storage is updated too so cold-restore sees the
+        same evicted state.
+
+        Run this right after every tool result is recorded so image
+        context stays bounded even within a single iteration (the
+        compaction pipeline only fires at iteration boundaries, too
+        late for a single turn that takes 4 screenshots).
+
+        Returns the number of messages whose image_content was evicted.
+        """
+        if not self._messages or keep_latest < 0:
+            return 0
+
+        # Find messages carrying images, walking newest → oldest.
+        image_indices: list[int] = []
+        for i in range(len(self._messages) - 1, -1, -1):
+            if self._messages[i].image_content:
+                image_indices.append(i)
+
+        # Nothing to evict if we have ≤ keep_latest images total.
+        if len(image_indices) <= keep_latest:
+            return 0
+
+        # Evict everything past the first keep_latest (newest) entries.
+        to_evict = image_indices[keep_latest:]
+        evicted = 0
+        for idx in to_evict:
+            msg = self._messages[idx]
+            self._messages[idx] = Message(
+                seq=msg.seq,
+                role=msg.role,
+                content=msg.content,
+                tool_use_id=msg.tool_use_id,
+                tool_calls=msg.tool_calls,
+                is_error=msg.is_error,
+                phase_id=msg.phase_id,
+                is_transition_marker=msg.is_transition_marker,
+                is_client_input=msg.is_client_input,
+                image_content=None,  # ← dropped
+                is_skill_content=msg.is_skill_content,
+                run_id=msg.run_id,
+            )
+            evicted += 1
+            if self._store:
+                await self._store.write_part(
+                    msg.seq, self._messages[idx].to_storage_dict()
+                )
+
+        if evicted:
+            # Reset token estimate — image blocks no longer contribute.
+            self._last_api_input_tokens = None
+            logger.info(
+                "evict_old_images: dropped image_content from %d message(s), "
+                "kept %d most recent",
+                evicted,
+                keep_latest,
+            )
+        return evicted
+
    async def compact(
        self,
        summary: str,
@@ -1165,16 +1249,18 @@ class NodeConversation:
            # Nothing to save — skip file creation
            conv_filename = ""

-        # Build reference message
+        # Build reference message. Prose format (no brackets) — see the
+        # poison-pattern note on truncate_tool_result. Frontier models
+        # autocomplete `[...']` trailers into their own text turns.
        ref_parts: list[str] = []
        if conv_filename:
            full_path = str((spill_path / conv_filename).resolve())
            ref_parts.append(
-                f"[Previous conversation saved to '{full_path}'. "
-                f"Use read_file('{conv_filename}') to review if needed.]"
+                f"Previous conversation saved at: {full_path}\n"
+                f"Read the full transcript with read_file('{conv_filename}')."
            )
        elif not collapsed_msgs:
-            ref_parts.append("[Previous freeform messages compacted.]")
+            ref_parts.append("(Previous freeform messages compacted.)")

        # Aggressive: add collapsed tool-call history to the reference
        if collapsed_msgs:
@@ -102,12 +102,14 @@ def microcompact(
        orig_len = len(msg.content)
        if spillover:
            placeholder = (
-                f"[Old tool result cleared: {orig_len} chars. "
-                f"Full data in '{spillover}'. "
-                f"Use read_file('{spillover}') to retrieve.]"
+                f"Old tool result ({orig_len:,} chars) cleared from context. "
+                f"Full data saved at: {spillover}\n"
+                f"Read the complete data with read_file(path='{spillover}')."
            )
        else:
-            placeholder = f"[Old tool result cleared: {orig_len} chars.]"
+            placeholder = (
+                f"Old tool result ({orig_len:,} chars) cleared from context."
+            )

        # Mutate in-place (microcompact is synchronous, no store writes)
        conversation._messages[i] = Message(
@@ -142,7 +144,14 @@ def _find_tool_name_for_result(messages: list[Message], tool_msg: Message) -> st


 def _extract_spillover_filename_inline(content: str) -> str | None:
-    """Quick inline check for spillover filename in tool result content."""
+    """Quick inline check for spillover filename in tool result content.
+
+    Matches both the new prose format ("saved at: /path") and the
+    legacy bracketed trailer ("saved to '/path'").
+    """
+    match = re.search(r"saved at:\s*(\S+)", content, re.IGNORECASE)
+    if match:
+        return match.group(1)
    match = re.search(r"saved to '([^']+)'", content, re.IGNORECASE)
    return match.group(1) if match else None

@@ -215,14 +215,30 @@ def truncate_tool_result(
    """Persist tool result to file and optionally truncate for context.

    When *spillover_dir* is configured, EVERY non-error tool result is
-    saved to a file (short filename like ``web_search_1.txt``).  A
-    ``[Saved to '...']`` annotation is appended so the reference
-    survives pruning and compaction.
+    written to disk for debugging. The LLM-visible content is then
+    shaped to avoid a **poison pattern** that we traced on 2026-04-15
+    through a gemini-3.1-pro-preview queen session: the prior format
+    appended ``\\n\\n[Saved to '/abs/path/file.txt']`` after every
+    small result, and frontier pattern-matching models (gemini 3.x in
+    particular) learned to autocomplete the `[Saved to '...']` trailer
+    in their own assistant turns, eventually degenerating into echoing
+    the whole tool result instead of deciding what to do next. See
+    ``session_20260415_100751_d49f4c28/conversations/parts/0000000056.json``
+    for the terminal case where the model's "text" output was the full
+    tool_result JSON.

-    - Small results (≤ limit): full content kept + file annotation
-    - Large results (> limit): preview + file reference
-    - Errors: pass through unchanged
-    - read_file results: truncate with pagination hint (no re-spill)
+    Rules after the fix:
+    - **Small results (≤ limit):** pass content through unchanged. No
+      trailer. No annotation. The full content is already in the
+      message; the disk copy is for debugging only.
+    - **Large results (> limit):** preview + file reference, but
+      formatted as plain prose instead of a bracketed ``[...]``
+      pattern. Structured JSON metadata ("_saved_to") is embedded
+      inside the JSON body when the preview is JSON-shaped so the
+      model can locate the full file without seeing a mimicry-prone
+      bracket token outside the body.
+    - **Errors:** pass through unchanged.
+    - **read_file results:** truncate with pagination hint (no re-spill).
    """
    limit = max_tool_result_chars

@@ -252,18 +268,20 @@ def truncate_tool_result(
        else:
            preview_block = result.content[:PREVIEW_CAP] + "…"

+        # Prose header (no brackets).
        header = (
-            f"[{tool_name} result: {len(result.content):,} chars — "
-            f"too large for context. Use offset_bytes/limit_bytes "
-            f"parameters to read smaller chunks.]"
+            f"Tool `{tool_name}` returned {len(result.content):,} characters "
+            f"(too large for context). Use offset_bytes / limit_bytes "
+            f"parameters to paginate smaller chunks."
        )
        if metadata_str:
            header += f"\n\nData structure:\n{metadata_str}"
        header += (
-            "\n\nWARNING: This is an INCOMPLETE preview. Do NOT draw conclusions or counts from it."
+            "\n\nWARNING: the preview below is a SAMPLE only — do NOT "
+            "draw counts, totals, or conclusions from it."
        )

-        truncated = f"{header}\n\nPreview (small sample only):\n{preview_block}"
+        truncated = f"{header}\n\nPreview (truncated):\n{preview_block}"
        logger.info(
            "%s result truncated: %d → %d chars (use offset/limit to paginate)",
            tool_name,
@@ -301,7 +319,10 @@ def truncate_tool_result(

        if limit > 0 and len(result.content) > limit:
            # Large result: build a small, metadata-rich preview so the
-            # LLM cannot mistake it for the complete dataset.
+            # LLM cannot mistake it for the complete dataset. The
+            # preview is introduced as plain prose (no bracketed
+            # ``[Result from …]`` token) so it doesn't prime the model
+            # to autocomplete the same pattern in its next turn.
            PREVIEW_CAP = 5000

            # Extract structural metadata (array lengths, key names)
@@ -316,21 +337,22 @@ def truncate_tool_result(
            else:
                preview_block = result.content[:PREVIEW_CAP] + "…"

-            # Assemble header with structural info + warning
+            # Prose header (no brackets). Absolute path still surfaced
+            # so the agent can read the full file, but it's framed as
+            # a sentence, not a bracketed trailer.
            header = (
-                f"[Result from {tool_name}: {len(result.content):,} chars — "
-                f"too large for context, saved to '{abs_path}'.]\n"
+                f"Tool `{tool_name}` returned {len(result.content):,} characters "
+                f"(too large for context). Full result saved at: {abs_path}\n"
+                f"Read the complete data with read_file(path='{abs_path}').\n"
            )
            if metadata_str:
-                header += f"\nData structure:\n{metadata_str}"
+                header += f"\nData structure:\n{metadata_str}\n"
            header += (
-                f"\n\nWARNING: The preview below is INCOMPLETE. "
-                f"Do NOT draw conclusions or counts from it. "
-                f"Use read_file(path='{abs_path}') to read the "
-                f"full data before analysis."
+                "\nWARNING: the preview below is a SAMPLE only — do NOT "
+                "draw counts, totals, or conclusions from it."
            )

-            content = f"{header}\n\nPreview (small sample only):\n{preview_block}"
+            content = f"{header}\n\nPreview (truncated):\n{preview_block}"
            logger.info(
                "Tool result spilled to file: %s (%d chars → %s)",
                tool_name,
@@ -338,10 +360,22 @@ def truncate_tool_result(
                abs_path,
            )
        else:
-            # Small result: keep full content + annotation with absolute path
-            content = f"{result.content}\n\n[Saved to '{abs_path}']"
+            # Small result: pass content through UNCHANGED.
+            #
+            # The prior design appended `\n\n[Saved to '/abs/path']`
+            # after every small result so the agent could re-read the
+            # file later. But (a) the full content is already in the
+            # message, so there's nothing to re-read; (b) the
+            # `[Saved to '…']` trailer is a repeating token pattern
+            # that frontier pattern-matching models autocomplete into
+            # their own assistant turns, eventually echoing whole tool
+            # results as "text" instead of making decisions. Dropping
+            # the trailer entirely kills the poison pattern. Spilled
+            # files on disk still exist for debugging — they just
+            # aren't advertised in the LLM-visible message.
+            content = result.content
            logger.info(
-                "Tool result saved to file: %s (%d chars → %s)",
+                "Tool result saved to file: %s (%d chars → %s, no trailer)",
                tool_name,
                len(result.content),
                filename,
@@ -373,15 +407,17 @@ def truncate_tool_result(
        else:
            preview_block = result.content[:PREVIEW_CAP] + "…"

+        # Prose header (no brackets) — see docstring for the poison
+        # pattern that the bracket format triggered.
        header = (
-            f"[Result from {tool_name}: {len(result.content):,} chars — "
-            f"truncated to fit context budget.]"
+            f"Tool `{tool_name}` returned {len(result.content):,} characters "
+            f"(truncated to fit context budget — no spillover dir configured)."
        )
        if metadata_str:
            header += f"\n\nData structure:\n{metadata_str}"
        header += (
-            "\n\nWARNING: This is an INCOMPLETE preview. "
-            "Do NOT draw conclusions or counts from the preview alone."
+            "\n\nWARNING: the preview below is a SAMPLE only — do NOT "
+            "draw counts, totals, or conclusions from it."
        )

        truncated = f"{header}\n\n{preview_block}"
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import asyncio
 import json
 import logging
 import time
@@ -49,7 +50,13 @@ class LoopConfig:
    """Configuration for the event loop."""

    max_iterations: int = 50
-    max_tool_calls_per_turn: int = 30
+    # 0 (or any non-positive value) disables the per-turn hard limit,
+    # letting a single assistant turn fan out arbitrarily many tool
+    # calls. Models like Gemini 3.1 Pro routinely emit 40-80 tool
+    # calls in one turn during browser exploration; capping them
+    # strands work half-finished and makes the next turn repeat the
+    # discarded calls, which is worse than just running them.
+    max_tool_calls_per_turn: int = 0
    judge_every_n_turns: int = 1
    stall_detection_threshold: int = 3
    stall_similarity_threshold: float = 0.85
@@ -67,14 +74,33 @@ class LoopConfig:
    compaction_warning_buffer_tokens: int = 12_000
    store_prefix: str = ""

-    # Overflow margin for max_tool_calls_per_turn. Tool calls are only
-    # discarded when the count exceeds max_tool_calls_per_turn * (1 + margin).
+    # Overflow margin for max_tool_calls_per_turn. When the limit is
+    # enabled (>0), tool calls are only discarded when the count
+    # exceeds max_tool_calls_per_turn * (1 + margin). Ignored when
+    # max_tool_calls_per_turn is 0.
    tool_call_overflow_margin: float = 0.5

    # Tool result context management.
    max_tool_result_chars: int = 30_000
    spillover_dir: str | None = None

+    # Image retention in conversation history.
+    # Screenshots from ``browser_screenshot`` are inlined as base64
+    # data URLs inside message ``image_content``. Each full-page
+    # screenshot costs ~250k tokens when the provider counts the
+    # base64 as text (gemini, most non-Anthropic providers). Four
+    # screenshots in one conversation push gemini's 1M context over
+    # the limit and the model starts emitting garbage.
+    #
+    # The framework strips image_content from older messages after
+    # every tool-result batch, keeping only the most recent N
+    # screenshots. The text metadata on evicted messages (url, size,
+    # scale hints) is preserved so the agent can still reason about
+    # "I took a screenshot at step N that showed the compose modal".
+    # Raise this only if you genuinely need longer visual history AND
+    # you know your provider is using native image tokenization.
+    max_retained_screenshots: int = 2
+
    # set_output value spilling.
    max_output_value_chars: int = 2_000

@@ -158,7 +184,7 @@ class OutputAccumulator:

    async def set(self, key: str, value: Any) -> None:
        """Set a key-value pair, auto-spilling large values to files."""
-        value = self._auto_spill(key, value)
+        value = await self._auto_spill(key, value)
        self.values[key] = value
        if self.store:
            cursor = await self.store.read_cursor() or {}
@@ -167,41 +193,67 @@ class OutputAccumulator:
            cursor["outputs"] = outputs
            await self.store.write_cursor(cursor)

-    def _auto_spill(self, key: str, value: Any) -> Any:
-        """Save large values to a file and return a reference string."""
+    async def _auto_spill(self, key: str, value: Any) -> Any:
+        """Save large values to a file and return a reference string.
+
+        Runs the JSON serialization and file write on a worker thread
+        so they don't block the asyncio event loop. For a 100k-char
+        dict this used to freeze every concurrent tool call for ~50ms
+        of ``json.dumps(indent=2)`` + a sync disk write; for bigger
+        payloads or slow storage (NFS, networked FS) the freeze was
+        proportionally worse.
+        """
        if self.max_value_chars <= 0 or not self.spillover_dir:
            return value

-        val_str = json.dumps(value, ensure_ascii=False) if not isinstance(value, str) else value
-        if len(val_str) <= self.max_value_chars:
+        # Cheap size probe first — if the value is already a short
+        # string we can skip both the JSON round-trip and the thread
+        # hop entirely.
+        if isinstance(value, str) and len(value) <= self.max_value_chars:
            return value

-        spill_path = Path(self.spillover_dir)
-        spill_path.mkdir(parents=True, exist_ok=True)
-        ext = ".json" if isinstance(value, (dict, list)) else ".txt"
-        filename = f"output_{key}{ext}"
-        write_content = (
-            json.dumps(value, indent=2, ensure_ascii=False)
-            if isinstance(value, (dict, list))
-            else str(value)
-        )
-        file_path = spill_path / filename
-        file_path.write_text(write_content, encoding="utf-8")
-        file_size = file_path.stat().st_size
-        logger.info(
-            "set_output value auto-spilled: key=%s, %d chars -> %s (%d bytes)",
-            key,
-            len(val_str),
-            filename,
-            file_size,
-        )
-        # Use absolute path so parent agents can find files from subagents
-        abs_path = str(file_path.resolve())
-        return (
-            f"[Saved to '{abs_path}' ({file_size:,} bytes). "
-            f"Use read_file(path='{abs_path}') "
-            f"to access full data.]"
-        )
+        def _spill_sync() -> Any:
+            # JSON serialization for size check (only for non-strings).
+            if isinstance(value, str):
+                val_str = value
+            else:
+                val_str = json.dumps(value, ensure_ascii=False)
+            if len(val_str) <= self.max_value_chars:
+                return value
+
+            spill_path = Path(self.spillover_dir)
+            spill_path.mkdir(parents=True, exist_ok=True)
+            ext = ".json" if isinstance(value, (dict, list)) else ".txt"
+            filename = f"output_{key}{ext}"
+            write_content = (
+                json.dumps(value, indent=2, ensure_ascii=False)
+                if isinstance(value, (dict, list))
+                else str(value)
+            )
+            file_path = spill_path / filename
+            file_path.write_text(write_content, encoding="utf-8")
+            file_size = file_path.stat().st_size
+            logger.info(
+                "set_output value auto-spilled: key=%s, %d chars -> %s (%d bytes)",
+                key,
+                len(val_str),
+                filename,
+                file_size,
+            )
+            # Use absolute path so parent agents can find files from subagents.
+            #
+            # Prose format (no brackets) — same fix as tool_result_handler:
+            # frontier pattern-matching models autocomplete bracketed
+            # `[Saved to '...']` trailers into their own assistant turns,
+            # eventually degenerating into echoing the file path as text.
+            # Keep the path accessible but frame it as plain prose.
+            abs_path = str(file_path.resolve())
+            return (
+                f"Output saved at: {abs_path} ({file_size:,} bytes). "
+                f"Read the full data with read_file(path='{abs_path}')."
+            )
+
+        return await asyncio.to_thread(_spill_sync)

    def get(self, key: str) -> Any | None:
        return self.values.get(key)
@@ -37,6 +37,8 @@ def build_prompt_spec(
    narrative: str | None = None,
    memory_prompt: str | None = None,
 ) -> PromptSpec:
+    from framework.skills.tool_gating import augment_catalog_for_tools
+
    resolved_memory = memory_prompt
    if resolved_memory is None:
        resolved_memory = getattr(ctx, "memory_prompt", "") or ""
@@ -46,6 +48,17 @@ def build_prompt_spec(
                resolved_memory = dynamic() or ""
            except Exception:
                resolved_memory = getattr(ctx, "memory_prompt", "") or ""
+
+    # Tool-gated pre-activation: inject full body of default skills whose
+    # trigger tools are present in this agent's tool list (e.g. browser_*
+    # pulls in hive.browser-automation). Keeps non-browser agents lean.
+    tool_names = [
+        getattr(t, "name", "") for t in (getattr(ctx, "available_tools", None) or [])
+    ]
+    skills_catalog_prompt = augment_catalog_for_tools(
+        ctx.skills_catalog_prompt or "", tool_names
+    )
+
    return PromptSpec(
        identity_prompt=ctx.identity_prompt or "",
        focus_prompt=focus_prompt
@@ -53,7 +66,7 @@ def build_prompt_spec(
        else (ctx.agent_spec.system_prompt or ""),
        narrative=narrative if narrative is not None else (ctx.narrative or ""),
        accounts_prompt=ctx.accounts_prompt or "",
-        skills_catalog_prompt=ctx.skills_catalog_prompt or "",
+        skills_catalog_prompt=skills_catalog_prompt,
        protocols_prompt=ctx.protocols_prompt or "",
        memory_prompt=resolved_memory,
        agent_type=ctx.agent_spec.agent_type,
@@ -1,5 +1,6 @@
 """Node definitions for Queen agent."""

+import re
 from pathlib import Path

 from framework.orchestrator import NodeSpec
@@ -32,6 +33,29 @@ def _build_appendices() -> str:
    return parts


+# Wraps prompt sections that should only be shown to vision-capable models.
+# Content inside `<!-- vision-only -->...<!-- /vision-only -->` is kept for
+# vision models and stripped for text-only models. Applied once per session
+# in queen_orchestrator.create_queen.
+_VISION_ONLY_BLOCK_RE = re.compile(
+    r"<!-- vision-only -->(.*?)<!-- /vision-only -->",
+    re.DOTALL,
+)
+
+
+def finalize_queen_prompt(text: str, has_vision: bool) -> str:
+    """Resolve `<!-- vision-only -->` blocks based on model capability.
+
+    For vision-capable models the markers are stripped and the inner
+    content is kept. For text-only models the whole block (markers +
+    content) is removed so the queen is never nudged toward tools it
+    cannot usefully invoke.
+    """
+    if has_vision:
+        return _VISION_ONLY_BLOCK_RE.sub(r"\1", text)
+    return _VISION_ONLY_BLOCK_RE.sub("", text)
+
+
 # Shared appendices — appended to every coding node's system prompt.
 _appendices = _build_appendices()

@@ -504,7 +528,7 @@ The queen writes final production-ready system prompts directly.

 MCP servers are loaded from the global registry by name. Available servers:
 - `hive_tools` — web search, email, CRM, calendar, 100+ integrations
- `gcu-tools` — browser automation (click, type, navigate, screenshot)
+- `gcu-tools` — browser automation (click, type, navigate<!-- vision-only -->, screenshot<!-- /vision-only -->)
 - `files-tools` — file I/O (read, write, edit, search, list)

 **Template variables:** Add a `variables:` section at the top of agent.json \
@@ -695,29 +719,64 @@ a saved agent.

 ## Forking the session into a persistent colony

-**When to use create_colony:** the user needs work to run \
-**headless, recurring, or in parallel to this chat** — something \
-that keeps going after you stop talking. Typical triggers:
+**Prove the work inline BEFORE scaling to a colony.** This is the \
+most important rule in this section. A colony is a durable, \
+unattended runtime — you must know the task mechanics work before \
+you bake them into one. The expensive, hard-to-debug failures \
+(dummy-target browser loops, wrong selectors, misread skills) \
+happen when a queen delegates to a colony without ever doing \
+the work herself first.
+
+**The inline-first, scale-after pattern:**
+
+  1. **Do one instance of the work yourself, inline**, right in \
+     this chat. Use your own tools. Open the browser, click the \
+     real button, type the real text, send the real message, \
+     verify the real result. This is the shortest path from \
+     "vague intent" to "known-working procedure" — you learn \
+     the exact selectors, the exact quirks, the exact sequence \
+     that works on this site / API / system right now.
+
+  2. **Report the result to the user.** "I sent the message to \
+     Dimitris — here's the confirmation. Before I scale this to \
+     your whole connection list, want me to tweak anything?" \
+     This gives the user a concrete sample to react to AND \
+     gives you feedback before the cost of scaling multiplies.
+
+  3. **Only after a successful inline run**, decide whether to:
+     - stay inline and iterate by hand (small batches)
+     - fan out via `run_parallel_workers` (one-shot batch, \
+       results needed RIGHT NOW, no persistence needed)
+     - scale via `create_colony` (headless / recurring / needs \
+       to survive this chat ending)
+
+**When to use create_colony:** after step 2 has succeeded, and \
+the user needs work to run **headless, recurring, or in parallel \
+to this chat**. Typical triggers:
  - "run this every morning / every hour / on a cron"
  - "keep monitoring X and alert me when Y"
  - "fire this off in the background, I'll check on it later"
  - "spin up a dedicated agent for this so I can keep working here"
  - any task that should survive the current conversation ending

-**When NOT to use it:** if the user just wants results RIGHT NOW \
-in this chat, use `run_parallel_workers` instead. If they want to \
-iterate on an agent design, stay in the planning/building flow. \
-Don't create a colony just because you "learned something \
-reusable" — the trigger is operational (needs to keep running), \
-not epistemic (knowledge worth saving).
+**When NOT to use it:**
+  - You haven't actually done the work once yet. STOP. Do it \
+    inline first. Delegating an untested procedure to a colony \
+    is the single most common cause of silent worker failure.
+  - The user wants results RIGHT NOW and doesn't need the task \
+    to persist → stay inline or use `run_parallel_workers`.
+  - You "learned something reusable" but there's no operational \
+    need to keep running — knowledge worth saving goes in a \
+    skill file, not a colony.

-**Two-step flow:**
+**Two-step flow (assuming step 1-2 above have succeeded):**
  1. AUTHOR A SKILL FIRST so the colony worker has the operational \
-     context it needs to run unattended. Use write_file to create a \
-     skill folder (recommended location: \
-     `~/.hive/skills/{skill-name}/SKILL.md`) capturing the \
-     procedure — API endpoints, auth flow, response shapes, \
-     gotchas, conventions, query patterns, rate limits. The \
+     context it needs to run unattended — and write it from the \
+     knowledge you just earned doing the work inline, not from \
+     speculation. Include the EXACT selectors, tool call \
+     sequences, and gotchas you hit in your own run. Use \
+     write_file to create the skill folder (recommended \
+     location: `~/.hive/skills/{skill-name}/SKILL.md`). The \
     SKILL.md needs YAML frontmatter with `name` (matching the \
     directory name) and `description` (1-1024 chars including \
     trigger keywords), followed by a markdown body. Optional \
@@ -726,12 +785,13 @@ not epistemic (knowledge worth saving).
  2. create_colony(colony_name, task, skill_path) — Validates the \
     skill folder, installs it under ~/.hive/skills/ if it isn't \
     already there, and forks this session into a new colony. \
-     NOTHING RUNS after this call: the task is baked into \
-     worker.json and the user starts the worker (or wires up a \
-     trigger) later from the new colony page. The task string \
-     must be FULL and self-contained — when the worker eventually \
-     runs it has zero memory of your chat. The skill you wrote is \
-     discovered on first scan so the worker starts informed.
+     The colony worker inherits your full conversation at spawn \
+     time, so it sees everything you already did and said — no \
+     repeated discovery. NOTHING RUNS immediately after this \
+     call: the task is baked into worker.json and the user starts \
+     the worker (or wires up a trigger) later from the new colony \
+     page. The task string still must be FULL and self-contained \
+     because triggers fire without your chat context.

 ## Workflow summary
 1. Understand requirements → discover tools → design the layout
@@ -826,7 +886,7 @@ search_files, run_command, undo_changes

 ## Browser Automation (gcu-tools MCP)
 All browser tools are prefixed with `browser_` (browser_start, browser_navigate, \
-browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \
+browser_click, browser_fill, browser_snapshot, <!-- vision-only -->browser_screenshot, <!-- /vision-only -->browser_scroll, \
 browser_tabs, browser_close, browser_evaluate, etc.).
 Follow the browser-automation skill protocol — activate it before using browser tools.

@@ -843,32 +903,62 @@ synthesis.

 ## Forking this session into a persistent colony

-**When to use create_colony:** the user needs work to run \
-**headless, recurring, or in parallel to this chat** — something \
-that should keep going after this conversation ends. Typical \
-triggers:
+**Prove the work inline BEFORE scaling to a colony.** This is the \
+most important rule in this section. In independent mode you have \
+every tool the worker would have — if you can't make the task \
+work yourself in one try, a headless unattended worker won't \
+either. The expensive, hard-to-debug failures (dummy-target \
+browser loops, wrong selectors, misread skills) happen when a \
+queen delegates to a colony without ever doing the work herself \
+first.
+
+**The inline-first, scale-after pattern:**
+
+  1. **Do one instance of the work yourself, inline**, right in \
+     this chat. Open the browser, click the real button, type \
+     the real text, send the real message, verify the real \
+     result. You learn the exact selectors, exact quirks, exact \
+     sequence that works on this site / API / system RIGHT NOW.
+  2. **Report the result to the user.** Show them the concrete \
+     sample. Ask if they want anything adjusted before you \
+     scale up.
+  3. **Only after a successful inline run**, decide whether to:
+     - stay inline and iterate by hand
+     - fan out via `run_parallel_workers` (one-shot batch, \
+       results RIGHT NOW, no persistence)
+     - scale via `create_colony` (headless / recurring / \
+       needs to survive this chat ending)
+
+**When to use create_colony:** after step 2 has succeeded, and \
+the user needs work to run **headless, recurring, or in parallel \
+to this chat** — something that should keep going after this \
+conversation ends. Typical triggers:
  - "run this every morning / every hour / on a cron"
  - "keep monitoring X and alert me when Y changes"
  - "fire this off in the background so I can keep working here"
  - "spin up a dedicated agent for this job"
  - any task that needs to survive the current session

-**When NOT to use it:** if the user just wants results RIGHT NOW \
-in this chat, use `run_parallel_workers` instead. Don't create a \
-colony just because you "learned something reusable" — the \
-trigger is operational (needs to keep running), not epistemic \
-(knowledge worth saving).
+**When NOT to use it:**
+  - You haven't actually done the work once yet. STOP. Do it \
+    inline first. This is the #1 cause of silent worker failure.
+  - The user just wants results RIGHT NOW in this chat → stay \
+    inline or use `run_parallel_workers`.
+  - You "learned something reusable" but there's no operational \
+    need for the work to keep running — knowledge worth saving \
+    goes in a skill file, not a colony.

-**Two-step flow:**
+**Two-step flow (assuming step 1-2 above have succeeded):**
  1. AUTHOR A SKILL FIRST in a SCRATCH location so the colony \
     worker has the operational context it needs to run \
-     unattended. Use write_file to create a skill folder \
+     unattended — and write it from the knowledge you just \
+     earned doing the work inline, not from speculation. Include \
+     the EXACT selectors, tool call sequences, and gotchas you \
+     hit in your own run. Use write_file to create a skill folder \
     somewhere temporary (e.g. `/tmp/{skill-name}/` or your \
-     working directory) capturing the procedure — API endpoints, \
-     auth flow, pagination, gotchas, rate limits, response \
-     shapes. DO NOT author it under `~/.hive/skills/` — that path \
-     is user-global and would leak the skill to every other \
-     agent. The SKILL.md needs YAML frontmatter with `name` \
+     working directory). DO NOT author it under `~/.hive/skills/` \
+     — that path is user-global and would leak the skill to every \
+     other agent. The SKILL.md needs YAML frontmatter with `name` \
     (matching the directory name) and `description` (1-1024 \
     chars including trigger keywords), followed by a markdown \
     body. Optional subdirs: scripts/, references/, assets/. \
@@ -878,12 +968,14 @@ trigger is operational (needs to keep running), not epistemic \
     the skill folder, forks this session into a new colony, and \
     installs the skill COLONY-SCOPED at \
     `~/.hive/colonies/{colony_name}/skills/{skill_name}/`. Only \
-     that colony's worker sees it, no other agent. NOTHING RUNS \
-     after this call — the task is baked into worker.json and \
-     the user starts the worker (or wires up a trigger) later \
-     from the new colony page. The task string must be FULL and \
-     self-contained because the worker has zero memory of your \
-     chat when it eventually runs.
+     that colony's worker sees it, no other agent. The colony \
+     worker inherits your full conversation at spawn time, so it \
+     sees everything you already did and said — no repeated \
+     discovery. NOTHING RUNS immediately after this call — the \
+     task is baked into worker.json and the user starts the \
+     worker (or wires up a trigger) later from the new colony \
+     page. The task string must still be FULL and self-contained \
+     because triggers fire without your chat context.
 """

 _queen_behavior_editing = """
@@ -899,33 +991,52 @@ Report the last run's results to the user and ask what they want to do next.
 """

 _queen_behavior_independent = """
-## Independent — do the work yourself
+## Independent — do the work yourself (inline first, always)

-You are the agent. No pre-loaded worker — you execute directly.
-1. Understand the task from the user
-2. Plan your approach briefly (no flowcharts or agent design)
-3. Execute using your tools: file I/O, shell commands, browser automation
-4. Report results, iterate if needed
+You are the agent. No pre-loaded worker — you execute directly. \
+**Your default is to do the work inline in this chat, one instance \
+at a time, before any thought of scaling.**

-## Scaling up from independent mode
+1. Understand the task from the user.
+2. Plan your approach briefly (no flowcharts, no agent design).
+3. **Do the work yourself, inline. One real instance.** Open the \
+   browser, call the real API, write to the real file, send the \
+   real message. Use your actual tools against real state. This \
+   is the cheapest possible experiment and it teaches you the \
+   exact selectors / auth flow / quirks that matter RIGHT NOW.
+4. **Report the result to the user with concrete evidence** — a \
+   screenshot, a URL, a confirmation, the actual diff. Let them \
+   react before you scale.
+5. Iterate if needed — STAY INLINE while you figure out the \
+   mechanics. Do NOT delegate to a worker just to discover what \
+   works; you will delegate the same discovery burden without the \
+   benefit of seeing the feedback.
+6. Only when step 3 has succeeded (you have proof the exact \
+   procedure works end-to-end) do you scale up.

-You have no pre-loaded worker in this phase, but you DO have two \
-lifecycle tools for spinning up work dynamically:
+**Scaling pathways** (in order of cost, cheapest first):
+- **Stay inline, run it again.** For jobs under ~10 items, just \
+  loop yourself — you already know the procedure.
+- **`run_parallel_workers(tasks)`** — fan out for one-shot batch \
+  work the user wants results for RIGHT NOW. No persistence, no \
+  colony. Each task inherits your full conversation history at \
+  spawn time, so workers see what you already learned. Use when \
+  you need concurrency to beat wall-clock time.
+- **`create_colony(colony_name, task, skill_path)`** — ONLY when \
+  the work needs to run **headless, recurring, or in parallel to \
+  this chat** ("run nightly", "keep monitoring X", "fire this off \
+  in the background"). Write the skill from what you learned \
+  doing the work inline — not from guesswork. Then fork. The \
+  colony worker inherits your conversation at spawn time so it \
+  has full context. Do NOT use this just because you "learned \
+  something reusable" — the trigger is operational (needs to \
+  keep running), not epistemic.

- **run_parallel_workers(tasks)** — for one-off batch work the user \
-  wants results for RIGHT NOW. Fan out N subtasks concurrently and \
-  synthesize the aggregated reports. No colony is created; the \
-  workers exist only for this call.
- **create_colony(colony_name, task, skill_path)** — when the user \
-  wants work to run **headless, recurring, or in parallel to this \
-  chat** (e.g. "run nightly", "keep monitoring X", "fire this off \
-  in the background"). Write a skill folder to scratch capturing \
-  the operational procedure, then call this to fork the session \
-  and install the skill colony-scoped. Nothing runs after fork — \
-  the user starts the worker (or sets a trigger) later from the \
-  new colony page. Do NOT use this just because you "learned \
-  something reusable" — the trigger is operational (needs to keep \
-  running), not epistemic.
+**Hard rule: NEVER call `run_parallel_workers` or `create_colony` \
+before you have successfully completed the task once inline.** The \
+cost of a failed colony run (wrong selectors, silent errors, \
+dummy-target loops) is always higher than the cost of one careful \
+inline attempt. When in doubt, do it yourself first.

 You do NOT have the agent-building lifecycle (no save_agent_draft, \
 confirm_and_build, load_built_agent, run_agent_with_input). If the \
@@ -21,7 +21,9 @@ All tools are prefixed with `browser_`:
 - `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type` — interact
 - `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
 - `browser_snapshot` — compact accessibility-tree read (structured)
+<!-- vision-only -->
 - `browser_screenshot` — visual capture (annotated PNG)
+<!-- /vision-only -->
 - `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`)
 - `browser_coords` — convert image pixels to CSS pixels (always use `css_x/y`, never `physical_x/y`)
 - `browser_scroll`, `browser_wait` — navigation helpers
@@ -41,6 +41,42 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


+def _format_spawn_task_message(task: str, input_data: dict[str, Any]) -> str:
+    """Render the spawn task into the worker's next user message.
+
+    Spawned workers inherit the queen's conversation via
+    ``ColonyRuntime._fork_parent_conversation``; this helper builds
+    the content of the trailing user message that carries the new
+    task. The queen's chat already provides the context for the
+    task, so we frame this as an explicit hand-off.
+
+    Additional keys from ``input_data`` (other than the task itself)
+    are rendered below the hand-off line so the worker sees them as
+    structured hand-off data. This mirrors the fresh-path
+    ``AgentLoop._build_initial_message`` shape so worker prompts look
+    roughly the same whether or not inheritance fired.
+    """
+    lines = [
+        "# New task delegated by the queen",
+        "",
+        "The queen's conversation up to this point is visible above. "
+        "Use it as context (who the user is, what was already decided, "
+        "which skills apply). Your own system prompt and tool set are "
+        "set by the framework — the queen's tools may differ from "
+        "yours, so treat her prior tool calls as history only.",
+        "",
+        f"task: {task}",
+    ]
+    for key, value in (input_data or {}).items():
+        if key in ("task", "user_request"):
+            # Already rendered above; don't duplicate.
+            continue
+        if value is None:
+            continue
+        lines.append(f"{key}: {value}")
+    return "\n".join(lines)
+
+
@dataclass
 class ColonyConfig:
    max_concurrent_workers: int = 100
@@ -432,6 +468,131 @@ class ColonyRuntime:
    def resume_timers(self) -> None:
        self._timers_paused = False

+    async def _fork_parent_conversation(
+        self,
+        dest_conv_dir: Path,
+        *,
+        task: str,
+        input_data: dict[str, Any] | None = None,
+    ) -> None:
+        """Fork the colony's parent queen conversation into ``dest_conv_dir``.
+
+        Copies the queen's ``parts/*.json`` and ``meta.json`` into the
+        worker's fresh conversation dir, then appends a synthetic user
+        message carrying the new task. The worker's subsequent
+        ``AgentLoop._restore`` reads this conversation via the usual
+        path — the queen's history is visible as prior turns, the task
+        appears as the most recent user message, and the worker starts
+        acting on it with full context.
+
+        This is a no-op if the colony runtime doesn't own a parent
+        queen conversation (e.g. a standalone colony started without a
+        queen wrapper).
+
+        Notes on filtering compatibility:
+          - Queen parts have ``phase_id=None``. When the worker's
+            restore applies its own phase filter, the backward-compat
+            fallback in NodeConversation.restore kicks in: an
+            all-None-phased store bypasses the filter. See
+            ``conversation.py:1369-1378``.
+          - ``cursor.json`` is deliberately NOT copied. The worker
+            should start fresh at iteration 0; copying the queen's
+            cursor would make the worker think it had already done
+            work.
+          - The queen's ``meta.json`` is copied but the AgentLoop
+            immediately rebuilds ``system_prompt`` from the worker's
+            own context post-restore (see agent_loop.py:533-535), so
+            the queen's system prompt does not leak into the worker.
+        """
+        # Resolve the queen's own conversation dir. For a queen-backed
+        # ColonyRuntime, storage_path points at the queen's session dir
+        # and conversations/ lives inside it. For standalone runtimes
+        # (tests, legacy fork path under ~/.hive/agents/{name}/worker/)
+        # there's no parent conversation — fall through to the fresh
+        # spawn path.
+        src_conv_dir = self._storage_path / "conversations"
+        src_parts_dir = src_conv_dir / "parts"
+        if not src_parts_dir.exists():
+            # No queen conversation to inherit — the worker starts with
+            # only the task, same as the pre-fork behavior. AgentLoop's
+            # fresh-conversation branch will call _build_initial_message
+            # and render input_data into the worker's first user message.
+            return
+
+        def _copy_and_append() -> None:
+            dest_parts = dest_conv_dir / "parts"
+            dest_parts.mkdir(parents=True, exist_ok=True)
+
+            # Copy each queen part. Use json.dumps round-trip (not raw
+            # file copy) so we can be defensive about unreadable files —
+            # a corrupted queen part file shouldn't take down the worker
+            # spawn, just drop that one part.
+            max_seq = -1
+            for part_file in sorted(src_parts_dir.glob("*.json")):
+                try:
+                    data = json.loads(part_file.read_text(encoding="utf-8"))
+                except (json.JSONDecodeError, OSError) as exc:
+                    logger.warning(
+                        "spawn fork: skipping unreadable queen part %s: %s",
+                        part_file.name,
+                        exc,
+                    )
+                    continue
+                seq = data.get("seq")
+                if isinstance(seq, int) and seq > max_seq:
+                    max_seq = seq
+                (dest_parts / part_file.name).write_text(
+                    json.dumps(data, ensure_ascii=False),
+                    encoding="utf-8",
+                )
+
+            # Copy the queen's meta.json so the worker's restore finds
+            # the conversation during its first run. The meta fields
+            # (system_prompt, max_context_tokens, etc.) get overridden
+            # by the worker's own AgentLoop config + context after
+            # restore, so nothing here bleeds into runtime behavior.
+            src_meta = src_conv_dir / "meta.json"
+            if src_meta.exists():
+                try:
+                    meta_data = json.loads(src_meta.read_text(encoding="utf-8"))
+                    (dest_conv_dir / "meta.json").write_text(
+                        json.dumps(meta_data, ensure_ascii=False),
+                        encoding="utf-8",
+                    )
+                except (json.JSONDecodeError, OSError) as exc:
+                    logger.warning(
+                        "spawn fork: failed to copy queen meta.json: %s", exc
+                    )
+
+            # Append the task as the next user message so the worker's
+            # LLM sees it as the most recent turn in the conversation
+            # after restore. This replaces the fresh-path call to
+            # _build_initial_message for spawned workers.
+            task_content = _format_spawn_task_message(task, input_data or {})
+            next_seq = max_seq + 1
+            task_part = {
+                "seq": next_seq,
+                "role": "user",
+                "content": task_content,
+                # phase_id omitted (None) so the backward-compat
+                # fallback in NodeConversation.restore keeps it visible
+                # to both queen-style and phase-filtered restores.
+                # run_id omitted so the worker's run_id filter (off by
+                # default since ctx.run_id is empty) doesn't reject it.
+            }
+            task_filename = f"{next_seq:010d}.json"
+            (dest_parts / task_filename).write_text(
+                json.dumps(task_part, ensure_ascii=False),
+                encoding="utf-8",
+            )
+            logger.info(
+                "spawn fork: inherited %d queen parts + appended task at seq %d",
+                max_seq + 1,
+                next_seq,
+            )
+
+        await asyncio.to_thread(_copy_and_append)
+
    # ── Worker Spawning ─────────────────────────────────────────

    async def spawn(
@@ -497,6 +658,22 @@ class ColonyRuntime:
            # (worse) the process CWD.
            worker_storage = self._storage_path / "workers" / worker_id
            worker_storage.mkdir(parents=True, exist_ok=True)
+
+            # Fork the queen's conversation into the worker's store.
+            # The queen already accumulated the user chat, read relevant
+            # skills, and made decisions about how to approach the task;
+            # the worker would repeat that discovery work (and often
+            # mis-step — see the 2026-04-14 "dummy-target" incident)
+            # if spawned with a blank store. We snapshot the queen's
+            # parts + meta at spawn time, then append the task as the
+            # next user message so the worker's AgentLoop restores into
+            # a conversation that already ends with its new instruction.
+            await self._fork_parent_conversation(
+                worker_storage / "conversations",
+                task=task,
+                input_data=input_data,
+            )
+
            worker_conv_store = FileConversationStore(
                worker_storage / "conversations"
            )
@@ -12,6 +12,11 @@ Vision support rules are derived from official vendor documentation:

 from __future__ import annotations

+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from framework.llm.provider import Tool
+

 def _model_name(model: str) -> str:
    """Return the bare model name after stripping any 'provider/' prefix."""
@@ -104,3 +109,22 @@ def supports_image_tool_results(model: str) -> bool:
    # 5. Default: assume vision capable
    #    Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
    return True
+
+
+def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
+    """Drop image-producing tools for text-only models.
+
+    Returns ``(filtered_tools, hidden_names)``. For vision-capable models
+    (or when *model* is empty) the input list is returned unchanged and
+    ``hidden_names`` is empty. For text-only models any tool with
+    ``produces_image=True`` is removed so the LLM never sees it in its
+    schema — avoids wasted calls and stale "screenshot failed" entries
+    in agent memory.
+    """
+    if not model or supports_image_tool_results(model):
+        return list(tools), []
+    hidden = [t.name for t in tools if t.produces_image]
+    if not hidden:
+        return list(tools), []
+    kept = [t for t in tools if not t.produces_image]
+    return kept, hidden
@@ -191,6 +191,14 @@ def _ensure_ollama_chat_prefix(model: str) -> str:
 RATE_LIMIT_MAX_RETRIES = 10
 RATE_LIMIT_BACKOFF_BASE = 2  # seconds
 RATE_LIMIT_MAX_DELAY = 120  # seconds - cap to prevent absurd waits
+# Separate, much lower cap for "empty response, finish_reason=stop"
+# scenarios. Unlike a real 429, these are rarely transient: Gemini
+# returns stop+empty on silently-filtered safety blocks, poisoned
+# conversation state (dangling tool_result after compaction), or
+# malformed tool schemas. Waiting minutes doesn't fix any of those, so
+# give up after 3 attempts (2+4+8 = 14s) and surface an actionable
+# error instead of burning 12+ minutes on exponential backoff.
+EMPTY_RESPONSE_MAX_RETRIES = 3
 MINIMAX_API_BASE = "https://api.minimax.io/v1"
 OPENROUTER_API_BASE = "https://openrouter.ai/api/v1"

@@ -872,22 +880,31 @@ class LiteLLMProvider(LLMProvider):
                        )
                        return response

-                    if attempt == retries:
+                    empty_cap = min(retries, EMPTY_RESPONSE_MAX_RETRIES)
+                    if attempt >= empty_cap:
                        logger.error(
-                            f"[retry] GAVE UP on {model} after {retries + 1} "
-                            f"attempts — empty response "
+                            f"[retry] GAVE UP on {model} after "
+                            f"{attempt + 1} attempts — empty response "
                            f"(finish_reason={finish_reason}, "
-                            f"choices={len(response.choices) if response.choices else 0})"
+                            f"choices={len(response.choices) if response.choices else 0}). "
+                            f"This is almost never a rate limit despite the "
+                            f"earlier log message — check the dumped request "
+                            f"at {dump_path} for poisoned conversation state "
+                            f"(dangling tool_result after compaction), a "
+                            f"safety-filter trigger in the prompt, or a "
+                            f"malformed tool schema."
                        )
                        return response
                    wait = _compute_retry_delay(attempt)
                    logger.warning(
                        f"[retry] {model} returned empty response "
                        f"(finish_reason={finish_reason}, "
-                        f"choices={len(response.choices) if response.choices else 0}) — "
-                        f"likely rate limited or quota exceeded. "
+                        f"choices={len(response.choices) if response.choices else 0}). "
                        f"Retrying in {wait}s "
-                        f"(attempt {attempt + 1}/{retries})"
+                        f"(attempt {attempt + 1}/{empty_cap}). "
+                        f"Note: empty-response retries are capped at "
+                        f"{EMPTY_RESPONSE_MAX_RETRIES} because this is rarely "
+                        f"a transient rate limit on small payloads."
                    )
                    time.sleep(wait)
                    continue
@@ -1097,22 +1114,35 @@ class LiteLLMProvider(LLMProvider):
                        )
                        return response

-                    if attempt == retries:
+                    # Use a much lower retry cap for empty-response
+                    # recoveries than for real exceptions. These are
+                    # almost never transient (see EMPTY_RESPONSE_MAX_RETRIES
+                    # rationale at the top of the file).
+                    empty_cap = min(retries, EMPTY_RESPONSE_MAX_RETRIES)
+                    if attempt >= empty_cap:
                        logger.error(
-                            f"[async-retry] GAVE UP on {model} after {retries + 1} "
-                            f"attempts — empty response "
+                            f"[async-retry] GAVE UP on {model} after "
+                            f"{attempt + 1} attempts — empty response "
                            f"(finish_reason={finish_reason}, "
-                            f"choices={len(response.choices) if response.choices else 0})"
+                            f"choices={len(response.choices) if response.choices else 0}). "
+                            f"This is almost never a rate limit despite the "
+                            f"earlier log message — check the dumped request "
+                            f"at {dump_path} for poisoned conversation state "
+                            f"(dangling tool_result after compaction), a "
+                            f"safety-filter trigger in the prompt, or a "
+                            f"malformed tool schema."
                        )
                        return response
                    wait = _compute_retry_delay(attempt)
                    logger.warning(
                        f"[async-retry] {model} returned empty response "
                        f"(finish_reason={finish_reason}, "
-                        f"choices={len(response.choices) if response.choices else 0}) — "
-                        f"likely rate limited or quota exceeded. "
+                        f"choices={len(response.choices) if response.choices else 0}). "
                        f"Retrying in {wait}s "
-                        f"(attempt {attempt + 1}/{retries})"
+                        f"(attempt {attempt + 1}/{empty_cap}). "
+                        f"Note: empty-response retries are capped at "
+                        f"{EMPTY_RESPONSE_MAX_RETRIES} because this is rarely "
+                        f"a transient rate limit on small payloads."
                    )
                    await asyncio.sleep(wait)
                    continue
@@ -27,6 +27,9 @@ class Tool:
    name: str
    description: str
    parameters: dict[str, Any] = field(default_factory=dict)
+    # If True, the tool may return ImageContent in its result. Text-only models
+    # (e.g. glm-5, deepseek-chat) have this hidden from their schema entirely.
+    produces_image: bool = False
    # If True, this tool performs no filesystem/process/network writes and is
    # safe to run concurrently with other safe-flagged tools inside the same
    # assistant turn. Unsafe tools (writes, shell, browser actions) are always
@@ -7,6 +7,7 @@ import inspect
 import json
 import logging
 import os
+import re
 from collections.abc import Callable
 from dataclasses import dataclass
 from pathlib import Path
@@ -18,6 +19,16 @@ logger = logging.getLogger(__name__)

 _INPUT_LOG_MAX_LEN = 500

+# Tools whose names match this pattern are assumed to return ImageContent.
+# Matched against the bare tool name (case-insensitive). Used to mark MCP
+# tools with produces_image=True so they can be filtered out for text-only
+# models before the schema is ever shown to the LLM (avoids wasted calls
+# and "screenshot failed" entries polluting memory).
+_IMAGE_TOOL_NAME_RE = re.compile(
+    r"(screenshot|screen_capture|capture_image|render_image|get_image|snapshot_image)",
+    re.IGNORECASE,
+)
+
 # Per-execution context overrides.  Each asyncio task (and thus each
 # concurrent graph execution) gets its own copy, so there are no races
 # when multiple ExecutionStreams run in parallel.
@@ -998,6 +1009,7 @@ class ToolRegistry:
                "properties": properties,
                "required": required,
            },
+            produces_image=bool(_IMAGE_TOOL_NAME_RE.search(mcp_tool.name or "")),
            concurrency_safe=mcp_tool.name in self.CONCURRENCY_SAFE_TOOLS,
        )

@@ -702,14 +702,27 @@ class Orchestrator:
        self.logger.info(f"   Goal: {goal.description}")
        self.logger.info(f"   Entry node: {graph.entry_node}")

-        # Set per-execution data_dir so data tools and spillover files
-        # share the same session-scoped directory.
+        # Set per-execution data_dir and agent_id so data tools and
+        # spillover files share the same session-scoped directory, and
+        # so MCP tools whose server-side schemas mark agent_id as a
+        # required field (list_dir, hashline_edit, replace_file_content,
+        # execute_command_tool, …) get a valid value injected even on
+        # registry instances where agent_loader.setup() didn't populate
+        # the session_context. Without this, FastMCP rejects those
+        # calls with "agent_id is a required property".
        _ctx_token = None
        if self._storage_path:
            from framework.loader.tool_registry import ToolRegistry

            _ctx_token = ToolRegistry.set_execution_context(
                data_dir=str(self._storage_path / "data"),
+                agent_id=graph.id,
+            )
+        else:
+            from framework.loader.tool_registry import ToolRegistry
+
+            _ctx_token = ToolRegistry.set_execution_context(
+                agent_id=graph.id,
            )

        try:
@@ -162,6 +162,8 @@ def build_prompt_spec_from_node_context(
    memory_prompt: str | None = None,
 ) -> NodePromptSpec:
    """Convert a NodeContext-like object into structured prompt inputs."""
+    from framework.skills.tool_gating import augment_catalog_for_tools
+
    resolved_memory_prompt = memory_prompt
    if resolved_memory_prompt is None:
        resolved_memory_prompt = getattr(ctx, "memory_prompt", "") or ""
@@ -171,6 +173,17 @@ def build_prompt_spec_from_node_context(
                resolved_memory_prompt = dynamic_memory_provider() or ""
            except Exception:
                resolved_memory_prompt = getattr(ctx, "memory_prompt", "") or ""
+
+    # Tool-gated pre-activation: inject full body of default skills whose
+    # trigger tools are present in this node's tool list (e.g. browser_*
+    # pulls in hive.browser-automation).
+    tool_names = [
+        getattr(t, "name", "") for t in (getattr(ctx, "available_tools", None) or [])
+    ]
+    skills_catalog_prompt = augment_catalog_for_tools(
+        ctx.skills_catalog_prompt or "", tool_names
+    )
+
    return NodePromptSpec(
        identity_prompt=ctx.identity_prompt or "",
        focus_prompt=focus_prompt
@@ -178,7 +191,7 @@ def build_prompt_spec_from_node_context(
        else (ctx.node_spec.system_prompt or ""),
        narrative=narrative if narrative is not None else (ctx.narrative or ""),
        accounts_prompt=ctx.accounts_prompt or "",
-        skills_catalog_prompt=ctx.skills_catalog_prompt or "",
+        skills_catalog_prompt=skills_catalog_prompt,
        protocols_prompt=ctx.protocols_prompt or "",
        memory_prompt=resolved_memory_prompt,
        node_type=ctx.node_spec.node_type,
@@ -311,7 +311,9 @@ async def create_queen(
        _queen_tools_running,
        _queen_tools_staging,
        _shared_building_knowledge,
+        finalize_queen_prompt,
    )
+    from framework.llm.capabilities import supports_image_tool_results
    from framework.host.event_bus import AgentEvent, EventType
    from framework.loader.mcp_registry import MCPRegistry
    from framework.loader.tool_registry import ToolRegistry
@@ -489,6 +491,13 @@ async def create_queen(
            "according to your current phase."
        )

+    # Resolve vision-only prompt sections based on the session's LLM.
+    # session.llm is immutable for the session's lifetime, so this check
+    # is stable — prompts never need to be recomposed mid-session.
+    _has_vision = bool(
+        session.llm and supports_image_tool_results(getattr(session.llm, "model", ""))
+    )
+
    _planning_body = (
        _queen_character_core
        + _queen_role_planning
@@ -500,7 +509,7 @@ async def create_queen(
        + _planning_knowledge
        + worker_identity
    )
-    phase_state.prompt_planning = _planning_body
+    phase_state.prompt_planning = finalize_queen_prompt(_planning_body, _has_vision)

    _building_body = (
        _queen_character_core
@@ -515,40 +524,52 @@ async def create_queen(
        + _appendices
        + worker_identity
    )
-    phase_state.prompt_building = _building_body
-    phase_state.prompt_staging = (
-        _queen_character_core
-        + _queen_role_staging
-        + _queen_style
-        + _queen_tools_staging
-        + _queen_behavior_always
-        + _queen_behavior_staging
-        + worker_identity
+    phase_state.prompt_building = finalize_queen_prompt(_building_body, _has_vision)
+    phase_state.prompt_staging = finalize_queen_prompt(
+        (
+            _queen_character_core
+            + _queen_role_staging
+            + _queen_style
+            + _queen_tools_staging
+            + _queen_behavior_always
+            + _queen_behavior_staging
+            + worker_identity
+        ),
+        _has_vision,
    )
-    phase_state.prompt_running = (
-        _queen_character_core
-        + _queen_role_running
-        + _queen_style
-        + _queen_tools_running
-        + _queen_behavior_always
-        + _queen_behavior_running
-        + worker_identity
+    phase_state.prompt_running = finalize_queen_prompt(
+        (
+            _queen_character_core
+            + _queen_role_running
+            + _queen_style
+            + _queen_tools_running
+            + _queen_behavior_always
+            + _queen_behavior_running
+            + worker_identity
+        ),
+        _has_vision,
    )
-    phase_state.prompt_editing = (
-        _queen_identity_editing
-        + _queen_style
-        + _queen_tools_editing
-        + _queen_behavior_always
-        + _queen_behavior_editing
-        + worker_identity
+    phase_state.prompt_editing = finalize_queen_prompt(
+        (
+            _queen_identity_editing
+            + _queen_style
+            + _queen_tools_editing
+            + _queen_behavior_always
+            + _queen_behavior_editing
+            + worker_identity
+        ),
+        _has_vision,
    )
-    phase_state.prompt_independent = (
-        _queen_character_core
-        + _queen_role_independent
-        + _queen_style
-        + _queen_tools_independent
-        + _queen_behavior_always
-        + _queen_behavior_independent
+    phase_state.prompt_independent = finalize_queen_prompt(
+        (
+            _queen_character_core
+            + _queen_role_independent
+            + _queen_style
+            + _queen_tools_independent
+            + _queen_behavior_always
+            + _queen_behavior_independent
+        ),
+        _has_vision,
    )

    # ---- Default skill protocols -------------------------------------
@@ -284,10 +284,16 @@ def _get_subscription_token(sub_id: str) -> str | None:
 def _hot_swap_sessions(
    request: web.Request, full_model: str, api_key: str | None, api_base: str | None
 ) -> int:
-    """Hot-swap the LLM on all running sessions. Returns count of swapped sessions."""
+    """Hot-swap the LLM on all running sessions. Returns count of swapped sessions.
+
+    Also refreshes the SessionManager's default model so that subsequent
+    one-shot LLM consumers (e.g. /messages/classify, new session bootstrap)
+    pick up the new provider/model instead of the stale startup override.
+    """
    from framework.server.session_manager import SessionManager

    manager: SessionManager = request.app["manager"]
+    manager._model = full_model
    swapped = 0
    for session in manager.list_sessions():
        llm_provider = getattr(session, "llm", None)
@@ -1,6 +1,6 @@
 ---
 name: hive.browser-automation
-description: Drive a real Chrome browser via the GCU Beeline extension + Chrome DevTools Protocol. Navigation, clicks, typing, screenshots, shadow-DOM sites (LinkedIn / Reddit / X), keyboard shortcuts, CSP gotchas, rich-text editors. Verified against real production sites 2026-04-11.
+description: Required before any browser_* tool call. Teaches the screenshot + browser_click_coordinate workflow that reaches shadow-DOM inputs selectors can't see, the CSS-pixel coordinate rule (not physical px), rich-text editor quirks ("send button stays disabled" failures), and CSP gotchas. Covers Chrome via CDP through the GCU Beeline extension. Skipping this causes repeated failures on LinkedIn / Reddit / X. Verified against real production sites 2026-04-11.
 metadata:
  author: hive
  type: default-skill
@@ -344,6 +344,51 @@ Reddit's search input lives **two shadow levels deep** inside `reddit-search-lar

 After submitting, press Escape to close the composer.

+## File uploads — use `browser_upload`, never click the upload button
+
+**Clicking an `<input type="file">` or the button that triggers one (X's photo button, LinkedIn's attach button, Gmail's paperclip) opens Chrome's native OS file picker. That dialog is rendered by the operating system, NOT the page, so CDP cannot see it, cannot interact with it, and the automation wedges.** This is the single most common way to lock up a browser session on any "compose with media" flow.
+
+**The only correct pattern:** call `browser_upload(selector, file_paths)`. It uses the CDP `DOM.setFileInputFiles` method, which sets the files directly on the input element's internal state as if the user had picked them — no OS dialog ever opens.
+
+```
+# WRONG — opens the native file picker, agent gets stuck
+browser_click_coordinate(photo_button_x, photo_button_y)   # ❌
+
+# RIGHT — sets the file programmatically, no dialog
+browser_upload(
+    selector="input[type='file']",          # the underlying file input
+    file_paths=["/absolute/path/to/image.png"],
+)
+```
+
+**Finding the file input.** On most modern SPAs the visible "Add photo" / "Attach" button is a styled `<button>` or `<label>`, and the real `<input type="file">` is hidden (often `display:none` or `opacity:0`, positioned offscreen, wrapped in a `<label for="...">`, or injected on click). Use `browser_evaluate` to enumerate ALL file inputs on the page first:
+
+```python
+browser_evaluate("""
+  (function(){
+    const inputs = Array.from(document.querySelectorAll('input[type="file"]'));
+    return inputs.map(el => ({
+      name: el.name || '',
+      accept: el.accept || '',
+      multiple: el.multiple,
+      id: el.id || '',
+      inViewport: (() => {
+        const r = el.getBoundingClientRect();
+        return r.width > 0 && r.height > 0;
+      })(),
+    }));
+  })();
+""")
+```
+
+Then pass the most specific selector that uniquely identifies the right input (e.g. `input[type='file'][accept*='image']` for a photo-only upload). `browser_upload` doesn't care if the input is hidden or offscreen — `DOM.setFileInputFiles` works on any valid file input node, visible or not.
+
+**X / LinkedIn / Twitter pattern.** On X (`x.com/compose/post`), the photo upload input is `input[data-testid='fileInput']` — hidden, reachable via `browser_upload`. On LinkedIn feed compose, look for `input[type='file'][accept*='image']` inside the post-creation modal after clicking "Add media" (clicking the Add-media button reveals the input but does NOT open the dialog; only clicking the SECOND layer — the "From computer" entry — would trigger the picker. Stop at the first layer, find the input, call `browser_upload`).
+
+**Verification after upload.** `DOM.setFileInputFiles` dispatches a `change` event on the input but NOT the `click` / `focus` events that some sites gate their UI on. Always verify the upload actually took effect by screenshotting the composer (the uploaded image should appear as a preview) or by checking for a "preview" / "remove" element that only exists post-upload. If verification fails, the site may be reading the file via some other bridge — fall back to reading the file bytes and pasting them via the clipboard (`navigator.clipboard.write` with a `ClipboardItem`) through `browser_evaluate`.
+
+**If a native file picker DOES open** (you clicked the wrong thing): there is no recovery via CDP. Press Escape via `browser_press("Escape")` immediately — this dismisses the OS dialog in Chrome on Linux/macOS. Then find the actual `<input type='file'>` and use `browser_upload`.
+
 ## Common pitfalls

 - **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above.
@@ -354,6 +399,7 @@ After submitting, press Escape to close the composer.
 - **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`.
 - **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements.
 - **Using `browser_type(selector)` on LinkedIn DMs or any shadow-DOM input.** Won't find the element. Fall back to click-to-focus + `browser_press` per character.
+- **Clicking a "Photo" / "Attach" / "Upload" button to pick a file.** This opens Chrome's NATIVE OS file picker, which is rendered outside the web page and cannot be interacted with via CDP. Your automation will hang staring at an unreachable dialog. ALWAYS use `browser_upload(selector, file_paths)` against the underlying `<input type='file'>` element — see the "File uploads" section above for the full pattern. This is the single most common way to wedge a browser session on compose-with-media flows (X/LinkedIn/Gmail).
 - **Keyboard shortcuts without the `code` field.** Chrome's shortcut dispatcher ignores keyboard events that lack a `code` or `windowsVirtualKeyCode`. `browser_press(..., modifiers=[...])` populates these automatically; raw `Input.dispatchKeyEvent` calls from `browser_evaluate` may not.
 - **Taking a screenshot more than 10s after the last interaction** and expecting the highlight to still be visible. The overlay fades after 10s. Take the screenshot sooner, or re-trigger the interaction.
 - **Expecting `browser_navigate` to return when you specified `wait_until="networkidle"` on a busy site.** networkidle is approximate — some sites keep a websocket or analytics beacon open forever. Use `"load"` or `"domcontentloaded"` for reliable timing.
@@ -1,6 +1,6 @@
 ---
 name: hive.linkedin-automation
-description: Drive LinkedIn via the browser-automation toolchain. Verified flows for profile messaging, connection-request acceptance, feed composition, and search. Includes every site-specific quirk we've hit (Trusted Types CSP, #interop-outlet shadow root, Lexical composer, beforeunload draft dialog, rate limits). Verified against logged-in production 2026-04-11.
+description: Read before automating LinkedIn with browser_* tools. LinkedIn combines shadow DOM (#interop-outlet), strict Trusted Types CSP that silently drops innerHTML, Lexical composer, native beforeunload dialogs that hang the bridge, and aggressive spam filters — each has bitten us at least once. Verified flows for profile messaging, connection-request acceptance, feed composition, and search. Requires hive.browser-automation. Verified against logged-in production 2026-04-11.
 metadata:
  author: hive
  type: default-skill
@@ -98,10 +98,20 @@ textarea = browser_evaluate("""
 browser_click_coordinate(textarea['cx'], textarea['cy'])
 sleep(0.6)

-# 6. Insert text via CDP Input.insertText (browser_type does this by default now).
-#    Per-char keyDown fails on Lexical composers — the keys dispatch but
-#    the editor never turns them into text.
-browser_type(<appropriate-selector-or-skip-selector-and-use-bridge-insertText>, text)
+# 6. Insert text via document.execCommand('insertText') through browser_evaluate.
+#    This is the ONLY reliable approach for LinkedIn's Lexical composer.
+#    See the "Lexical composer quirks" section below for why browser_type
+#    with a selector does NOT work here (the contenteditable lives inside
+#    the #interop-outlet shadow root which document.querySelector can't
+#    reach). The click in step 5 already put Lexical into edit mode, so
+#    execCommand injects straight into the focused editor's state.
+browser_evaluate("""
+  (function(){
+    document.execCommand('insertText', false, %s);
+    return true;
+  })();
+""" % json.dumps(message_text))   # json.dumps gives you a safely-escaped JS string literal
+sleep(1.0)   # let Lexical commit state + enable Send button

 # 7. Find the modal Send button (filter by in-viewport, reject pinned bar)
 send = browser_evaluate("""
@@ -133,11 +143,24 @@ send = browser_evaluate("""
  })();
 """)

-# 8. ONLY click Send if it's enabled — if disabled, the editor didn't register the input.
-#    Don't click blindly; the framework state is the source of truth, not the DOM text.
-if not send['disabled']:
-    browser_click_coordinate(send['cx'], send['cy'])
-    sleep(2.5)  # wait for send + bubble render
+# 8. ONLY click Send if it's enabled — if disabled, the execCommand
+#    didn't land. DO NOT retry with a different tool; the fix is
+#    always: re-click the composer rect, re-run execCommand, re-check.
+#    The Send button's `disabled` state IS the ground truth — if
+#    Lexical registered your text, it enables the button. If it's
+#    still disabled, your text did not reach the editor, regardless
+#    of what any tool call claims.
+if send['disabled']:
+    # The editor didn't receive your text. Do NOT click Send. Do NOT
+    # fall back to browser_type with a dummy selector (see anti-pattern
+    # in Common Pitfalls). Instead: re-click the textarea rect from
+    # step 4, wait a beat, re-run the execCommand insertText from step
+    # 6. If that still fails after 2 retries, bail and surface — the
+    # modal may have been reclaimed by a stale state or auth wall.
+    raise Exception("Send button disabled after insertText — editor did not receive input")
+
+browser_click_coordinate(send['cx'], send['cy'])
+sleep(2.5)  # wait for send + bubble render
 ```

 **Verify post-send**: the composer textarea should now be empty (`innerText === ''`) and `.msg-s-event-listitem__message-bubble` count should have grown by 1. Walk the shadow tree via `browser_evaluate` to check.
@@ -223,6 +246,60 @@ if state['found'] and not state['disabled']:
    browser_click("button.share-actions__primary-action")
 ```

+## Posting WITH an image attached
+
+**Do NOT click the "Add media" / image icon inside the feed post composer to pick a file.** LinkedIn renders a styled button that opens Chrome's native OS file picker when clicked, and that dialog is unreachable via CDP — the automation will hang on an invisible modal. Use `browser_upload` directly against the hidden `<input type='file'>`:
+
+```python
+# After the post modal is open and the editor has text:
+# (A) First, click "Add media" to surface the file input
+#     (clicking THIS button reveals the input but does NOT itself open
+#     the OS picker on current LinkedIn — the picker only opens if
+#     you click the inner "Choose from your device" entry).
+media_btn = browser_get_rect("button[aria-label*='image'], button[aria-label*='photo']")
+browser_click_coordinate(media_btn.cx, media_btn.cy)
+sleep(0.8)
+
+# (B) Enumerate file inputs to find the right one
+inputs = browser_evaluate("""
+  (function(){
+    return Array.from(document.querySelectorAll('input[type="file"]'))
+      .map((el, i) => ({
+        idx: i,
+        accept: el.accept || '',
+        name: el.name || '',
+      }));
+  })();
+""")
+# Expect to see one with accept='image/*' or accept containing 'image/jpeg'
+
+# (C) Set the file programmatically — no dialog
+browser_upload(
+    selector="input[type='file'][accept*='image']",
+    file_paths=["/absolute/path/to/logo.png"],
+)
+sleep(3)  # LinkedIn shows an upload-progress bar + preview
+
+# (D) Verify the image preview rendered before clicking Post
+preview_ok = browser_evaluate("""
+  (function(){
+    // LinkedIn shows the preview as an <img> inside
+    // .share-creation-state__image-preview or similar.
+    return !!document.querySelector(
+      '.share-creation-state__preview img, .image-preview-container img'
+    );
+  })();
+""")
+if not preview_ok:
+    raise Exception("LinkedIn image upload did not render — do NOT click Post")
+
+# (E) Now click Post as usual
+browser_click("button.share-actions__primary-action")
+sleep(4)  # media post takes longer to commit than text-only
+```
+
+If the image isn't already on disk, write it first with `write_file(absolute_path, bytes)`. `browser_upload` only accepts absolute paths.
+
 ## Rate limits and safety

 LinkedIn's abuse detection is aggressive. Respect these limits:
@@ -247,8 +324,9 @@ If any of those show up, **stop the run, screenshot the state, and surface the i
 ## Common pitfalls

 - **`innerHTML` injection is silently dropped** — LinkedIn's Trusted Types CSP discards any `innerHTML = "<...>"` from injected scripts, no console error. Always use `createElement` + `appendChild` + `setAttribute` for DOM injection. `textContent`, `style.cssText`, and `.value` assignments are fine.
- **Per-char keyDown on the message composer produces empty text** — Lexical intercepts `beforeinput` and drops raw keys. Use `browser_type` (which now routes through CDP `Input.insertText`), or call `Input.insertText` directly via the bridge on the focused shadow element.
- **`browser_type(selector=...)` can't see the message composer** — it's inside `#interop-outlet` shadow. `document.querySelector('div.msg-form__contenteditable')` returns nothing. Use the shadow-walk + click-to-focus pattern above.
+- **Do NOT use `browser_type` on the message composer — use `document.execCommand('insertText', false, text)` via `browser_evaluate` instead.** The Lexical contenteditable lives inside the `#interop-outlet` shadow root which `document.querySelector` (what `browser_type` uses under the hood) cannot see. Attempts to work around this with `browser_shadow_query` fail because `browser_type` doesn't support the `>>>` shadow-pierce syntax. The ONLY reliable insert path is: (1) `browser_click_coordinate` on the composer rect (put Lexical in edit mode via a real CDP pointer click) → (2) `browser_evaluate` with `document.execCommand('insertText', false, <message>)` against the focused editor. This pattern is verified end-to-end across 15+ successful sends in session `session_20260414_113244_a98cfd66` (2026-04-14).
+- **Per-char keyDown on the message composer produces empty text** — Lexical intercepts `beforeinput` and drops raw keys. Ignore `browser_type` entirely for LinkedIn DMs; use the `execCommand('insertText')` path above.
+- **ANTI-PATTERN: "inject a dummy `<div id='dummy-target'>` and pass it as the `selector` arg to `browser_type`".** This looks tempting but fails compoundingly: `browser_type` clicks the **dummy div's** rect (not the editor's), the click lands on the Lexical wrapper's non-editable chrome, the contenteditable never receives focus, and `Input.insertText` fires against nothing. The bridge will still return `{"ok": true, "action": "type", "length": N}` because it has no way to verify the text actually landed. Symptom: Send button stays `disabled: true` forever. Fix: use `execCommand('insertText')` exactly as shown in the profile-message flow above. (See `session_20260414_114820_08bd3c4d` for the failed attempt.)
 - **Multiple Send buttons on the page** — the pinned bottom-right messaging bar has its own `msg-form__send-button` that's usually below `innerHeight`. Filter by in-viewport before clicking.
 - **`window.onbeforeunload` hangs navigation/close** — after typing in a composer, any `browser_navigate` or `close_tab` can pop a native "unsent message, leave?" confirm dialog that deadlocks the bridge. Always strip `onbeforeunload` before any navigation, and wrap composer flows in a `try/finally` that runs the cleanup block:

@@ -1,6 +1,6 @@
 ---
 name: hive.x-automation
-description: Drive X / Twitter via the browser-automation toolchain. Verified flows for posting, replying, deleting, search-and-engage, and the compose composer's Draft.js quirks. Includes the daily-reply and job-market-reply patterns distilled from the backed-up x-daily-replies and x-job-market-replies skills. Verified 2026-04-11.
+description: Read before automating X / Twitter with browser_* tools. Verified flows for post, reply, delete, search-and-engage, plus the Draft.js compose quirks that silently disable the send button. Includes the daily-reply and job-market-reply playbooks. Requires hive.browser-automation for the underlying screenshot + coordinate workflow. Verified 2026-04-11.
 metadata:
  author: hive
  type: default-skill
@@ -79,6 +79,61 @@ if state['found'] and not state['disabled']:
    browser_press("Escape")  # close any leftover modal
 ```

+## Posting a tweet WITH an image
+
+**Critical: NEVER click the photo button.** On `x.com/compose/post` the media button is a styled `<button>` that triggers Chrome's native OS file picker when clicked — that dialog is unreachable via CDP and will wedge the automation. Instead, set the file directly on the hidden `<input type='file'>` element using `browser_upload`:
+
+```python
+# 1. Open the compose modal as usual
+browser_press("n")
+sleep(1.5)
+browser_click_coordinate(ta_rect.cx, ta_rect.cy)
+sleep(0.5)
+browser_type("[data-testid='tweetTextarea_0']", tweet_text)
+
+# 2. Find the hidden file input X uses for media uploads.
+#    X's input is marked with data-testid='fileInput' and accepts
+#    image/*,video/*. It's hidden (display:none) but still mounted.
+inputs = browser_evaluate("""
+  (function(){
+    return Array.from(document.querySelectorAll('input[type="file"]'))
+      .map(el => ({
+        testid: el.getAttribute('data-testid') || '',
+        accept: el.accept || '',
+        multiple: el.multiple,
+      }));
+  })();
+""")
+# Expect to see: [{testid: 'fileInput', accept: 'image/jpeg,...', multiple: true}]
+
+# 3. Set the file WITHOUT opening any dialog
+browser_upload(
+    selector="input[data-testid='fileInput']",
+    file_paths=["/absolute/path/to/photo.png"],
+)
+sleep(2)  # X takes ~1-2s to show the preview thumbnail
+
+# 4. Verify the preview rendered before posting — if not, the upload
+#    didn't land and Post button will fail.
+preview = browser_evaluate("""
+  (function(){
+    // X renders uploaded media as an <img> with data-testid='attachments'
+    // (or similar) inside the composer.
+    const att = document.querySelector('[data-testid="attachments"] img');
+    return { hasPreview: !!att };
+  })();
+""")
+if not preview['hasPreview']:
+    raise Exception("Upload didn't render in composer — do NOT click Post")
+
+# 5. Now click Post as usual
+browser_click("[data-testid='tweetButton']")
+sleep(3)  # media upload + post takes longer than text-only
+browser_press("Escape")
+```
+
+If you don't already have the image file on disk, write it first: `write_file("/tmp/x_upload.png", base64_bytes)` or copy from a known location. `browser_upload` requires an absolute file path — relative paths and `~` expansion are not supported.
+
 ## Reply to a post flow

 The reply flow is the same shape as posting, with a few scroll / find-and-click steps before.
@@ -14,13 +14,37 @@ from framework.skills.skill_errors import SkillErrorCode, log_skill_error

 logger = logging.getLogger(__name__)

-_BEHAVIORAL_INSTRUCTION = (
-    "The following skills provide specialized instructions for specific tasks.\n"
-    "When a task matches a skill's description, read the SKILL.md at the listed\n"
-    "location to load the full instructions before proceeding.\n"
-    "When a skill references relative paths, resolve them against the skill's\n"
-    "directory (the parent of SKILL.md) and use absolute paths in tool calls."
-)
+# Upper bound on the raw `<available_skills>` XML body, in characters.
+# When the full catalog (with <description> entries) exceeds this, we fall
+# back to the compact variant that drops descriptions but keeps every skill
+# visible. Preserving awareness of every skill beats truncating entries.
+_COMPACT_THRESHOLD_CHARS = 5000
+
+_MANDATORY_HEADER_FULL = """## Skills (mandatory)
+Before replying: scan <available_skills> <description> entries.
+- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
+- If multiple could apply: choose the most specific one, then read/follow it.
+- If none clearly apply: do not read any SKILL.md.
+Constraints: never read more than one skill up front; only read after selecting.
+- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
+
+
+The following skills provide specialized instructions for specific tasks.
+Use `read_file` to load a skill's SKILL.md when the task matches its description.
+When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
+
+_MANDATORY_HEADER_COMPACT = """## Skills (mandatory)
+Before replying: scan <available_skills> <name> entries.
+- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
+- If multiple could apply: choose the most specific one, then read/follow it.
+- If none clearly apply: do not read any SKILL.md.
+Constraints: never read more than one skill up front; only read after selecting.
+- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
+
+
+The following skills provide specialized instructions for specific tasks.
+Use `read_file` to load a skill's SKILL.md when the task matches its name.
+When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""


 class SkillCatalog:
@@ -61,27 +85,42 @@ class SkillCatalog:
    def to_prompt(self) -> str:
        """Generate the catalog prompt for system prompt injection.

-        Returns empty string if no community/user skills are discovered
-        (default skills are handled separately by DefaultSkillManager).
-        """
-        # All skills go through the catalog for progressive disclosure.
-        all_skills = list(self._skills.values())
+        Returns empty string when no skills are present. Otherwise returns
+        a mandatory pre-reply checklist + decision rules + rate-limit note,
+        followed by the <available_skills> XML body.

+        When the full XML body exceeds ``_COMPACT_THRESHOLD_CHARS``, the
+        compact variant is emitted instead: <description> elements are
+        dropped so every skill stays visible before any gets truncated.
+        """
+        all_skills = sorted(self._skills.values(), key=lambda s: s.name)
        if not all_skills:
            return ""

+        full_xml = self._render_xml(all_skills, compact=False)
+        if len(full_xml) <= _COMPACT_THRESHOLD_CHARS:
+            return f"{_MANDATORY_HEADER_FULL}\n\n{full_xml}"
+
+        compact_xml = self._render_xml(all_skills, compact=True)
+        return f"{_MANDATORY_HEADER_COMPACT}\n\n{compact_xml}"
+
+    @staticmethod
+    def _render_xml(skills: list[ParsedSkill], *, compact: bool) -> str:
+        """Render the `<available_skills>` block.
+
+        ``compact=True`` drops `<description>` to preserve skill awareness
+        when the catalog would otherwise blow the char budget.
+        """
        lines = ["<available_skills>"]
-        for skill in sorted(all_skills, key=lambda s: s.name):
+        for skill in skills:
            lines.append("  <skill>")
            lines.append(f"    <name>{escape(skill.name)}</name>")
-            lines.append(f"    <description>{escape(skill.description)}</description>")
+            if not compact:
+                lines.append(f"    <description>{escape(skill.description)}</description>")
            lines.append(f"    <location>{escape(skill.location)}</location>")
-            lines.append(f"    <base_dir>{escape(skill.base_dir)}</base_dir>")
            lines.append("  </skill>")
        lines.append("</available_skills>")
-
-        xml_block = "\n".join(lines)
-        return f"{_BEHAVIORAL_INSTRUCTION}\n\n{xml_block}"
+        return "\n".join(lines)

    def build_pre_activated_prompt(self, skill_names: list[str]) -> str:
        """Build prompt content for pre-activated skills.
@@ -0,0 +1,88 @@
+"""Tool-gated pre-activation of default skills.
+
+Maps tool-name prefixes to default skills whose full body should be
+injected into the system prompt whenever a matching tool is available
+to the agent. This sidesteps progressive disclosure for skills that are
+foundational to a tool family — the agent shouldn't have to discover
+them reactively after its first broken selector call.
+
+Only the foundation skill (e.g. ``hive.browser-automation``) is wired
+in here. Site-specific skills (``hive.x-automation``,
+``hive.linkedin-automation``) stay in the catalog and rely on their
+descriptions to get picked up on demand.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Iterable
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_SKILLS_DIR = Path(__file__).parent / "_default_skills"
+
+# (tool-name prefix, default skill directory name, display name)
+_TOOL_GATED_SKILLS: list[tuple[str, str, str]] = [
+    ("browser_", "browser-automation", "hive.browser-automation"),
+]
+
+_BODY_CACHE: dict[str, str] = {}
+
+
+def _load_body(dir_name: str) -> str:
+    """Load the markdown body of a framework default skill, cached."""
+    if dir_name in _BODY_CACHE:
+        return _BODY_CACHE[dir_name]
+
+    path = _DEFAULT_SKILLS_DIR / dir_name / "SKILL.md"
+    body = ""
+    try:
+        raw = path.read_text(encoding="utf-8")
+        # Strip YAML frontmatter (between the first two '---' fences)
+        if raw.startswith("---"):
+            parts = raw.split("---", 2)
+            if len(parts) >= 3:
+                body = parts[2].strip()
+            else:
+                body = raw.strip()
+        else:
+            body = raw.strip()
+    except OSError as exc:
+        logger.warning("Failed to read tool-gated skill '%s': %s", dir_name, exc)
+
+    _BODY_CACHE[dir_name] = body
+    return body
+
+
+def augment_catalog_for_tools(
+    base_catalog_prompt: str,
+    tool_names: Iterable[str],
+) -> str:
+    """Return the catalog prompt with tool-gated skill bodies appended.
+
+    For each entry in ``_TOOL_GATED_SKILLS`` whose prefix matches any
+    name in ``tool_names``, appends the skill's full body as a
+    ``--- Pre-Activated Skill: <name> ---`` block. When no tool-gated
+    skill matches, returns ``base_catalog_prompt`` unchanged.
+    """
+    names = {str(name) for name in tool_names if name}
+    if not names:
+        return base_catalog_prompt
+
+    blocks: list[str] = []
+    for prefix, dir_name, display in _TOOL_GATED_SKILLS:
+        if not any(n.startswith(prefix) for n in names):
+            continue
+        body = _load_body(dir_name)
+        if not body:
+            continue
+        blocks.append(f"--- Pre-Activated Skill: {display} ---\n{body}")
+
+    if not blocks:
+        return base_catalog_prompt
+
+    suffix = "\n\n".join(blocks)
+    if not base_catalog_prompt:
+        return suffix
+    return f"{base_catalog_prompt}\n\n{suffix}"
@@ -56,6 +56,8 @@ export interface ChatMessage {
  nodeId?: string;
  /** Backend execution_id for this message */
  executionId?: string;
+  /** True when the message was sent while the queen was still processing */
+  queued?: boolean;
 }

 interface ChatPanelProps {
@@ -210,6 +212,211 @@ function ToolActivityRow({ content }: { content: string }) {
  );
 }

+// --- Inline ask_user fallback ---------------------------------------------
+// Sometimes the model prints the ask_user / ask_user_multiple payload as
+// regular assistant text instead of invoking the tool. We detect that
+// payload here and render a QuestionWidget / MultiQuestionWidget inline so
+// the user still gets the nice button UI. Submissions are sent back as a
+// regular user message via onSend (there is no pending backend state to
+// fulfill, so we treat it like the user answering in chat).
+
+type AskUserInlinePayload =
+  | { kind: "single"; question: string; options: string[] }
+  | {
+      kind: "multi";
+      questions: { id: string; prompt: string; options?: string[] }[];
+    };
+
+function detectAskUserPayload(content: string): AskUserInlinePayload | null {
+  if (!content) return null;
+  let text = content.trim();
+  if (!text) return null;
+  // Strip an optional ```json ... ``` / ``` ... ``` code fence
+  const fence = text.match(/^```(?:json|JSON)?\s*([\s\S]*?)\s*```$/);
+  if (fence) text = fence[1].trim();
+  // Strip surrounding double quotes that fully wrap a JSON object
+  if (text.length >= 2 && text.startsWith('"') && text.endsWith('"')) {
+    const inner = text.slice(1, -1).trim();
+    if (inner.startsWith("{") && inner.endsWith("}")) text = inner;
+  }
+  if (!text.startsWith("{") || !text.endsWith("}")) return null;
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(text);
+  } catch {
+    return null;
+  }
+  if (!parsed || typeof parsed !== "object") return null;
+  const obj = parsed as Record<string, unknown>;
+
+  // ask_user_multiple: { questions: [{ id, prompt, options? }, ...] }
+  if (Array.isArray(obj.questions)) {
+    const raw = obj.questions as unknown[];
+    if (raw.length < 1 || raw.length > 8) return null;
+    const questions: { id: string; prompt: string; options?: string[] }[] = [];
+    for (let i = 0; i < raw.length; i++) {
+      const q = raw[i];
+      if (!q || typeof q !== "object") return null;
+      const qo = q as Record<string, unknown>;
+      const prompt =
+        typeof qo.prompt === "string"
+          ? qo.prompt
+          : typeof qo.question === "string"
+            ? qo.question
+            : null;
+      if (!prompt) return null;
+      const id = typeof qo.id === "string" && qo.id ? qo.id : `q${i}`;
+      let options: string[] | undefined;
+      if (
+        Array.isArray(qo.options) &&
+        qo.options.every((o) => typeof o === "string")
+      ) {
+        options = qo.options as string[];
+      }
+      questions.push({ id, prompt, options });
+    }
+    return { kind: "multi", questions };
+  }
+
+  // ask_user: { question: string, options: string[] }
+  const question = typeof obj.question === "string" ? obj.question : null;
+  const options =
+    Array.isArray(obj.options) &&
+    obj.options.every((o) => typeof o === "string")
+      ? (obj.options as string[])
+      : null;
+  if (!question || !options || options.length < 2) return null;
+  return { kind: "single", question, options };
+}
+
+function InlineAskUserBubble({
+  msg,
+  payload,
+  activeThread,
+  onSend,
+  queenPhase,
+  showQueenPhaseBadge = true,
+}: {
+  msg: ChatMessage;
+  payload: AskUserInlinePayload;
+  activeThread: string;
+  onSend: (
+    message: string,
+    thread: string,
+    images?: ImageContent[],
+  ) => void;
+  queenPhase?: "planning" | "building" | "staging" | "running" | "independent";
+  showQueenPhaseBadge?: boolean;
+}) {
+  const [state, setState] = useState<"pending" | "submitted" | "dismissed">(
+    "pending",
+  );
+
+  // Once the user submits an answer via the inline widget, hide the whole
+  // bubble — their reply appears right after as a normal user message.
+  if (state === "submitted") return null;
+
+  // If the user dismissed without answering, fall back to the regular
+  // MarkdownContent rendering so they can still see what the model said.
+  if (state === "dismissed") {
+    return (
+      <MessageBubble
+        msg={msg}
+        queenPhase={queenPhase}
+        showQueenPhaseBadge={showQueenPhaseBadge}
+      />
+    );
+  }
+
+  const isQueen = msg.role === "queen";
+  const color = getColor(msg.agent, msg.role);
+  const thread = msg.thread || activeThread;
+
+  const handleSingle = (answer: string) => {
+    setState("submitted");
+    onSend(answer, thread);
+  };
+
+  const handleMulti = (answers: Record<string, string>) => {
+    setState("submitted");
+    if (payload.kind !== "multi") return;
+    // Format answers as a readable, numbered list for the outgoing message.
+    const lines = payload.questions.map((q, i) => {
+      const a = answers[q.id] ?? "";
+      return `${i + 1}. ${q.prompt}\n   ${a}`;
+    });
+    onSend(lines.join("\n"), thread);
+  };
+
+  return (
+    <div className="flex gap-3">
+      <div
+        className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
+        style={{
+          backgroundColor: `${color}18`,
+          border: `1.5px solid ${color}35`,
+          boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
+        }}
+      >
+        {isQueen ? (
+          <Crown className="w-4 h-4" style={{ color }} />
+        ) : (
+          <Cpu className="w-3.5 h-3.5" style={{ color }} />
+        )}
+      </div>
+      <div
+        className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}
+      >
+        <div className="flex items-center gap-2 mb-1">
+          <span
+            className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
+            style={{ color }}
+          >
+            {msg.agent}
+          </span>
+          {(!isQueen || showQueenPhaseBadge) && (
+            <span
+              className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
+                isQueen
+                  ? "bg-primary/15 text-primary"
+                  : "bg-muted text-muted-foreground"
+              }`}
+            >
+              {isQueen
+                ? (msg.phase ?? queenPhase) === "independent"
+                  ? "independent"
+                  : (msg.phase ?? queenPhase) === "running"
+                    ? "running"
+                    : (msg.phase ?? queenPhase) === "staging"
+                      ? "staging"
+                      : (msg.phase ?? queenPhase) === "planning"
+                        ? "planning"
+                        : "building"
+                : "Worker"}
+            </span>
+          )}
+        </div>
+        {payload.kind === "single" ? (
+          <QuestionWidget
+            inline
+            question={payload.question}
+            options={payload.options}
+            onSubmit={handleSingle}
+            onDismiss={() => setState("dismissed")}
+          />
+        ) : (
+          <MultiQuestionWidget
+            inline
+            questions={payload.questions}
+            onSubmit={handleMulti}
+            onDismiss={() => setState("dismissed")}
+          />
+        )}
+      </div>
+    </div>
+  );
+}
+
 const MessageBubble = memo(
  function MessageBubble({
    msg,
@@ -289,7 +496,9 @@ const MessageBubble = memo(
    if (isUser) {
      return (
        <div className="flex justify-end">
-          <div className="max-w-[75%] bg-primary text-primary-foreground text-sm leading-relaxed rounded-2xl rounded-br-md px-4 py-3">
+          <div
+            className={`max-w-[75%] bg-primary text-primary-foreground text-sm leading-relaxed rounded-2xl rounded-br-md px-4 py-3${msg.queued ? " animate-pulse opacity-80" : ""}`}
+          >
            {msg.images && msg.images.length > 0 && (
              <div className="flex flex-wrap gap-2 mb-2">
                {msg.images.map((img, i) => (
@@ -305,6 +514,11 @@ const MessageBubble = memo(
            {msg.content && (
              <p className="whitespace-pre-wrap break-words">{msg.content}</p>
            )}
+            {msg.queued && (
+              <span className="block text-[10px] opacity-60 mt-1 text-right">
+                queued
+              </span>
+            )}
          </div>
        </div>
      );
@@ -587,24 +801,51 @@ export default function ChatPanel({
        onScroll={handleScroll}
        className="flex-1 overflow-auto px-5 py-4 space-y-3"
      >
-        {renderItems.map((item) =>
-          item.kind === "parallel" ? (
-            <div key={item.groupId}>
-              <ParallelSubagentBubble
-                groupId={item.groupId}
-                groups={item.groups}
-              />
-            </div>
-          ) : (
-            <div key={item.msg.id}>
+        {renderItems.map((item) => {
+          if (item.kind === "parallel") {
+            return (
+              <div key={item.groupId}>
+                <ParallelSubagentBubble
+                  groupId={item.groupId}
+                  groups={item.groups}
+                />
+              </div>
+            );
+          }
+          const msg = item.msg;
+          // Detect misformatted ask_user payloads emitted as plain text and
+          // substitute the nicer widget-based bubble.  Only inspect regular
+          // agent messages — skip system rows, tool status, dividers, etc.
+          const askPayload =
+            (msg.role === "queen" || msg.role === "worker") &&
+            !msg.type &&
+            msg.content
+              ? detectAskUserPayload(msg.content)
+              : null;
+          if (askPayload) {
+            return (
+              <div key={msg.id}>
+                <InlineAskUserBubble
+                  msg={msg}
+                  payload={askPayload}
+                  activeThread={activeThread}
+                  onSend={onSend}
+                  queenPhase={queenPhase}
+                  showQueenPhaseBadge={showQueenPhaseBadge}
+                />
+              </div>
+            );
+          }
+          return (
+            <div key={msg.id}>
              <MessageBubble
-                msg={item.msg}
+                msg={msg}
                queenPhase={queenPhase}
                showQueenPhaseBadge={showQueenPhaseBadge}
              />
            </div>
-          ),
-        )}
+          );
+        })}

        {/* Show typing indicator while waiting for first queen response (disabled + empty chat) */}
        {(isWaiting || (disabled && threadMessages.length === 0)) && (
@@ -11,9 +11,15 @@ export interface MultiQuestionWidgetProps {
  questions: QuestionItem[];
  onSubmit: (answers: Record<string, string>) => void;
  onDismiss?: () => void;
+  /**
+   * When true, skip the global Enter-to-submit listener. Use this when rendering
+   * the widget inline alongside other inputs (e.g. the chat textarea) so Enter
+   * isn't hijacked from the surrounding UI.
+   */
+  inline?: boolean;
 }

-export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }: MultiQuestionWidgetProps) {
+export default function MultiQuestionWidget({ questions, onSubmit, onDismiss, inline = false }: MultiQuestionWidgetProps) {
  // Per-question state: selected index (null = nothing, options.length = "Other")
  const [selections, setSelections] = useState<(number | null)[]>(
    () => questions.map(() => null),
@@ -50,8 +56,10 @@ export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }:
    onSubmit(answers);
  }, [canSubmit, submitted, questions, selections, customTexts, onSubmit]);

-  // Enter to submit (only when not focused on a text input)
+  // Enter to submit (only when not focused on a text input).
+  // Skipped in inline mode so the widget doesn't hijack keys from surrounding inputs.
  useEffect(() => {
+    if (inline) return;
    const handleKeyDown = (e: KeyboardEvent) => {
      if (submitted) return;
      const target = e.target as HTMLElement;
@@ -63,7 +71,7 @@ export default function MultiQuestionWidget({ questions, onSubmit, onDismiss }:
    };
    window.addEventListener("keydown", handleKeyDown);
    return () => window.removeEventListener("keydown", handleKeyDown);
-  }, [handleSubmit, submitted]);
+  }, [handleSubmit, submitted, inline]);

  if (submitted) return null;

@@ -10,9 +10,15 @@ export interface QuestionWidgetProps {
  onSubmit: (answer: string, isOther: boolean) => void;
  /** Called when user dismisses the question without answering */
  onDismiss?: () => void;
+  /**
+   * When true, the widget does not register a global keyboard listener. Set this
+   * when rendering the widget inline alongside other inputs (e.g. a chat textarea)
+   * so Enter / number keys do not get hijacked from the surrounding UI.
+   */
+  inline?: boolean;
 }

-export default function QuestionWidget({ question, options, onSubmit, onDismiss }: QuestionWidgetProps) {
+export default function QuestionWidget({ question, options, onSubmit, onDismiss, inline = false }: QuestionWidgetProps) {
  const [selected, setSelected] = useState<number | null>(null);
  const [customText, setCustomText] = useState("");
  const [submitted, setSubmitted] = useState(false);
@@ -42,8 +48,10 @@ export default function QuestionWidget({ question, options, onSubmit, onDismiss
    }
  }, [canSubmit, submitted, isOtherSelected, customText, options, selected, onSubmit]);

-  // Keyboard: Enter to submit, number keys to select (only when text input is not focused)
+  // Keyboard: Enter to submit, number keys to select (only when text input is not focused).
+  // Skipped in inline mode so the widget doesn't hijack keys from surrounding inputs.
  useEffect(() => {
+    if (inline) return;
    const handleKeyDown = (e: KeyboardEvent) => {
      if (submitted) return;
      const inTextInput = e.target === inputRef.current;
@@ -66,7 +74,7 @@ export default function QuestionWidget({ question, options, onSubmit, onDismiss

    window.addEventListener("keydown", handleKeyDown);
    return () => window.removeEventListener("keydown", handleKeyDown);
-  }, [handleSubmit, submitted, options.length]);
+  }, [handleSubmit, submitted, options.length, inline]);

  if (submitted) return null;

@@ -238,6 +238,12 @@ export default function ColonyChat() {
  agentStateRef.current = agentState;

  const turnCounterRef = useRef<Record<string, number>>({});
+  // Maps tool_use_id → the pill message ID and tool name that was created for it.
+  // Survives turn counter resets so deferred completions (e.g. ask_user) can
+  // find and update the correct pill even after the counter changes.
+  const toolUseToPillRef = useRef<
+    Record<string, { msgId: string; name: string }>
+  >({});
  const queenPhaseRef = useRef<string>("planning");
  const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
  const suppressIntroRef = useRef(false);
@@ -271,7 +277,19 @@ export default function ColonyChat() {
            return prev.map((m, i) => (i === lastIdx ? { ...m, id: chatMsg.id } : m));
          }
        }
-        return [...prev, chatMsg];
+        // Insert in sorted position by createdAt so tool pills and queen
+        // messages interleave correctly when multiple arrive out of order.
+        const ts = chatMsg.createdAt ?? Date.now();
+        let insertIdx = prev.length - 1;
+        while (insertIdx >= 0 && (prev[insertIdx].createdAt ?? 0) > ts) {
+          insertIdx--;
+        }
+        if (insertIdx === -1 || insertIdx === prev.length - 1) {
+          return [...prev, chatMsg];
+        }
+        const next = [...prev];
+        next.splice(insertIdx + 1, 0, chatMsg);
+        return next;
      });
    },
    [],
@@ -456,6 +474,7 @@ export default function ColonyChat() {
      setGraphNodes([]);
      setAgentState(defaultAgentState());
      turnCounterRef.current = {};
+      toolUseToPillRef.current = {};
      queenPhaseRef.current = "planning";
      queenIterTextRef.current = {};
      suppressIntroRef.current = false;
@@ -770,6 +789,12 @@ export default function ColonyChat() {
            const toolUseId = (event.data?.tool_use_id as string) || "";

            const sid = event.stream_id;
+            // Track which pill message this tool belongs to so deferred
+            // completions (ask_user) can find it after the turn counter changes.
+            toolUseToPillRef.current[toolUseId] = {
+              msgId: `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`,
+              name: toolName,
+            };
            setAgentState((prev) => {
              const newActive = {
                ...prev.activeToolCalls,
@@ -814,30 +839,73 @@ export default function ColonyChat() {
              appendNodeLog(event.node_id, `${ts} INFO  ${toolName} done${resultStr}`);
            }

+            // Look up the original pill message this tool belongs to.
+            // For deferred completions (ask_user), the turn counter and
+            // activeToolCalls have already been reset, so we rely on the
+            // ref recorded during tool_call_started.
+            const tracked = toolUseToPillRef.current[toolUseId];
+            delete toolUseToPillRef.current[toolUseId];
+
            const sid = event.stream_id;
+
+            // Mark done in activeToolCalls if still present (normal case)
            setAgentState((prev) => {
-              const updated = { ...prev.activeToolCalls };
-              if (updated[toolUseId]) {
-                updated[toolUseId] = { ...updated[toolUseId], done: true };
+              if (!prev.activeToolCalls[toolUseId]) return prev;
+              return {
+                ...prev,
+                activeToolCalls: {
+                  ...prev.activeToolCalls,
+                  [toolUseId]: {
+                    ...prev.activeToolCalls[toolUseId],
+                    done: true,
+                  },
+                },
+              };
+            });
+
+            // Determine the correct pill message ID
+            const pillMsgId =
+              tracked?.msgId ??
+              `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`;
+            const trackedName = tracked?.name;
+
+            // Update the pill message content directly
+            setMessages((prevMsgs) => {
+              const idx = prevMsgs.findIndex((m) => m.id === pillMsgId);
+              if (idx < 0) return prevMsgs;
+
+              try {
+                const parsed = JSON.parse(prevMsgs[idx].content);
+                const tools: { name: string; done: boolean }[] =
+                  parsed.tools || [];
+
+                if (trackedName) {
+                  let marked = false;
+                  for (let i = 0; i < tools.length; i++) {
+                    if (
+                      tools[i].name === trackedName &&
+                      !tools[i].done &&
+                      !marked
+                    ) {
+                      tools[i] = { ...tools[i], done: true };
+                      marked = true;
+                    }
+                  }
+                }
+
+                const allDone =
+                  tools.length > 0 && tools.every((t) => t.done);
+                return prevMsgs.map((m, i) =>
+                  i === idx
+                    ? {
+                        ...m,
+                        content: JSON.stringify({ tools, allDone }),
+                      }
+                    : m,
+                );
+              } catch {
+                return prevMsgs;
              }
-              const tools = Object.values(updated)
-                .filter((t) => t.streamId === sid)
-                .map((t) => ({ name: t.name, done: t.done }));
-              const allDone = tools.length > 0 && tools.every((t) => t.done);
-              upsertMessage({
-                id: `tool-pill-${sid}-${event.execution_id || "exec"}-${currentTurn}`,
-                agent: agentDisplayName || event.node_id || "Agent",
-                agentColor: "",
-                content: JSON.stringify({ tools, allDone }),
-                timestamp: "",
-                type: "tool_status",
-                role,
-                thread: agentPath,
-                createdAt: eventCreatedAt,
-                nodeId: event.node_id || undefined,
-                executionId: event.execution_id || undefined,
-              });
-              return { ...prev, activeToolCalls: updated };
            });
          }
          break;
@@ -58,6 +58,12 @@ export default function QueenDM() {
  const [cloneTask, setCloneTask] = useState("");

  const turnCounterRef = useRef(0);
+  // Maps tool_use_id → the pill message ID and tool name that was created for it.
+  // Survives turn counter resets so deferred completions (e.g. ask_user) can
+  // find and update the correct pill even after llm_turn_complete bumps the counter.
+  const toolUseToPillRef = useRef<
+    Record<string, { msgId: string; name: string }>
+  >({});
  const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
  const [queenPhase, setQueenPhase] = useState<
    "planning" | "building" | "staging" | "running" | "independent"
@@ -77,6 +83,7 @@ export default function QueenDM() {
    setQueenPhase("independent");
    setInitialDraft(null);
    turnCounterRef.current = 0;
+    toolUseToPillRef.current = {};
    queenIterTextRef.current = {};
  }, []);

@@ -95,7 +102,14 @@ export default function QueenDM() {
        if (restored.length > 0 && !cancelled()) {
          restored.sort((a, b) => (a.createdAt ?? 0) - (b.createdAt ?? 0));
          setMessages(restored);
-          setIsTyping(false);
+          // Only clear typing if the history contains a completed execution;
+          // during bootstrap the queen is still processing.
+          const hasCompleted = events.some(
+            (e: AgentEvent) => e.type === "execution_completed",
+          );
+          if (hasCompleted) {
+            setIsTyping(false);
+          }
        }
      } catch {
        // No history
@@ -383,6 +397,12 @@ export default function QueenDM() {
          setIsTyping(true);
          setQueenReady(true);
          setActiveToolCalls({});
+          toolUseToPillRef.current = {};
+          // Clear queued flag on all user messages now that the queen is processing
+          setMessages((prev) => {
+            if (!prev.some((m) => m.queued)) return prev;
+            return prev.map((m) => (m.queued ? { ...m, queued: undefined } : m));
+          });
          break;

        case "execution_completed":
@@ -544,6 +564,14 @@ export default function QueenDM() {
          const toolUseId = (event.data?.tool_use_id as string) || "";
          const sid = event.stream_id;
          const execId = event.execution_id || "exec";
+          const eventCreatedAt = event.timestamp
+            ? new Date(event.timestamp).getTime()
+            : Date.now();
+
+          // Track which pill message this tool belongs to so deferred
+          // completions (ask_user) can find it after the turn counter changes.
+          const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
+          toolUseToPillRef.current[toolUseId] = { msgId, name: toolName };

          setActiveToolCalls((prev) => {
            const newActive = {
@@ -555,7 +583,6 @@ export default function QueenDM() {
              done: t.done,
            }));
            const allDone = tools.length > 0 && tools.every((t) => t.done);
-            const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
            const toolMsg: ChatMessage = {
              id: msgId,
              agent: queenName,
@@ -565,16 +592,29 @@ export default function QueenDM() {
              type: "tool_status",
              role: "queen",
              thread: "queen-dm",
-              createdAt: Date.now(),
+              createdAt: eventCreatedAt,
              nodeId: event.node_id || undefined,
              executionId: event.execution_id || undefined,
            };
            setMessages((prevMsgs) => {
              const idx = prevMsgs.findIndex((m) => m.id === msgId);
              if (idx >= 0) {
-                return prevMsgs.map((m, i) => (i === idx ? toolMsg : m));
+                return prevMsgs.map((m, i) =>
+                  i === idx ? { ...toolMsg, createdAt: m.createdAt ?? toolMsg.createdAt } : m,
+                );
              }
-              return [...prevMsgs, toolMsg];
+              // Insert in sorted position by createdAt
+              const ts = toolMsg.createdAt ?? Date.now();
+              let insertIdx = prevMsgs.length - 1;
+              while (insertIdx >= 0 && (prevMsgs[insertIdx].createdAt ?? 0) > ts) {
+                insertIdx--;
+              }
+              if (insertIdx === -1 || insertIdx === prevMsgs.length - 1) {
+                return [...prevMsgs, toolMsg];
+              }
+              const next = [...prevMsgs];
+              next.splice(insertIdx + 1, 0, toolMsg);
+              return next;
            });
            return newActive;
          });
@@ -583,41 +623,68 @@ export default function QueenDM() {

        case "tool_call_completed": {
          const toolUseId = (event.data?.tool_use_id as string) || "";
+
+          // Look up the original pill message this tool belongs to.
+          // For deferred completions (ask_user), the turn counter and
+          // activeToolCalls have already been reset by llm_turn_complete,
+          // so we rely on the ref recorded during tool_call_started.
+          const tracked = toolUseToPillRef.current[toolUseId];
+          delete toolUseToPillRef.current[toolUseId];
+
+          // Mark done in activeToolCalls if still present (normal case)
+          setActiveToolCalls((prev) => {
+            if (!prev[toolUseId]) return prev;
+            return {
+              ...prev,
+              [toolUseId]: { ...prev[toolUseId], done: true },
+            };
+          });
+
+          // Determine the correct pill message ID
          const sid = event.stream_id;
          const execId = event.execution_id || "exec";
+          const pillMsgId =
+            tracked?.msgId ??
+            `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
+          const toolName = tracked?.name;

-          setActiveToolCalls((prev) => {
-            const updated = { ...prev };
-            if (updated[toolUseId]) {
-              updated[toolUseId] = { ...updated[toolUseId], done: true };
-            }
-            const tools = Object.entries(updated).map(([, t]) => ({
-              name: t.name,
-              done: t.done,
-            }));
-            const allDone = tools.length > 0 && tools.every((t) => t.done);
-            const msgId = `tool-pill-${sid}-${execId}-${turnCounterRef.current}`;
-            const toolMsg: ChatMessage = {
-              id: msgId,
-              agent: queenName,
-              agentColor: "",
-              content: JSON.stringify({ tools, allDone }),
-              timestamp: "",
-              type: "tool_status",
-              role: "queen",
-              thread: "queen-dm",
-              createdAt: Date.now(),
-              nodeId: event.node_id || undefined,
-              executionId: event.execution_id || undefined,
-            };
-            setMessages((prevMsgs) => {
-              const idx = prevMsgs.findIndex((m) => m.id === msgId);
-              if (idx >= 0) {
-                return prevMsgs.map((m, i) => (i === idx ? toolMsg : m));
+          // Update the pill message content directly
+          setMessages((prevMsgs) => {
+            const idx = prevMsgs.findIndex((m) => m.id === pillMsgId);
+            if (idx < 0) return prevMsgs;
+
+            try {
+              const parsed = JSON.parse(prevMsgs[idx].content);
+              const tools: { name: string; done: boolean }[] =
+                parsed.tools || [];
+
+              if (toolName) {
+                let marked = false;
+                for (let i = 0; i < tools.length; i++) {
+                  if (
+                    tools[i].name === toolName &&
+                    !tools[i].done &&
+                    !marked
+                  ) {
+                    tools[i] = { ...tools[i], done: true };
+                    marked = true;
+                  }
+                }
              }
-              return [...prevMsgs, toolMsg];
-            });
-            return updated;
+
+              const allDone =
+                tools.length > 0 && tools.every((t) => t.done);
+              return prevMsgs.map((m, i) =>
+                i === idx
+                  ? {
+                      ...m,
+                      content: JSON.stringify({ tools, allDone }),
+                    }
+                  : m,
+              );
+            } catch {
+              return prevMsgs;
+            }
          });
          break;
        }
@@ -645,6 +712,7 @@ export default function QueenDM() {
        setPendingOptions(null);
      }

+      const isQueenBusy = isTyping;
      const userMsg: ChatMessage = {
        id: makeId(),
        agent: "You",
@@ -655,6 +723,7 @@ export default function QueenDM() {
        thread: "queen-dm",
        createdAt: Date.now(),
        images,
+        queued: isQueenBusy || undefined,
      };
      setMessages((prev) => [...prev, userMsg]);
      setIsTyping(true);
@@ -666,7 +735,7 @@ export default function QueenDM() {
        });
      }
    },
-    [sessionId, awaitingInput],
+    [sessionId, awaitingInput, isTyping],
  );

  const handleQuestionAnswer = useCallback(
@@ -699,6 +768,13 @@ export default function QueenDM() {
      await executionApi.cancelQueen(sessionId);
      setIsTyping(false);
      setIsStreaming(false);
+      setActiveToolCalls({});
+      toolUseToPillRef.current = {};
+      // Clear queued flags since the queen is now idle
+      setMessages((prev) => {
+        if (!prev.some((m) => m.queued)) return prev;
+        return prev.map((m) => (m.queued ? { ...m, queued: undefined } : m));
+      });
    } catch {
      // ignore
    }
@@ -4,7 +4,8 @@ from __future__ import annotations

 import pytest

-from framework.llm.capabilities import supports_image_tool_results
+from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
+from framework.llm.provider import Tool


 class TestSupportsImageToolResults:
@@ -56,3 +57,56 @@ class TestSupportsImageToolResults:
        assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
        assert supports_image_tool_results("OLLAMA/llama3") is False
        assert supports_image_tool_results("GPT-4o") is True
+
+
+class TestFilterToolsForModel:
+    """Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""
+
+    def test_hides_image_tool_from_text_only_model(self):
+        tools = [
+            Tool(name="read_file", description="read a file"),
+            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
+            Tool(name="browser_snapshot", description="get page content"),
+        ]
+        filtered, hidden = filter_tools_for_model(tools, "glm-5")
+        names = [t.name for t in filtered]
+        assert "browser_screenshot" not in names
+        assert "read_file" in names
+        assert "browser_snapshot" in names
+        assert hidden == ["browser_screenshot"]
+
+    def test_keeps_image_tool_for_vision_model(self):
+        tools = [
+            Tool(name="read_file", description="read a file"),
+            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
+        ]
+        filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-20250514")
+        assert {t.name for t in filtered} == {"read_file", "browser_screenshot"}
+        assert hidden == []
+
+    def test_default_tools_are_not_filtered(self):
+        """Tools without produces_image (default False) are kept for all models."""
+        tools = [
+            Tool(name="read_file", description="read a file"),
+            Tool(name="web_search", description="search the web"),
+        ]
+        text_only, text_hidden = filter_tools_for_model(tools, "glm-5")
+        vision, vision_hidden = filter_tools_for_model(tools, "gpt-4o")
+        assert len(text_only) == 2 and text_hidden == []
+        assert len(vision) == 2 and vision_hidden == []
+
+    def test_empty_model_string_returns_tools_unchanged(self):
+        """Guards the ctx.llm-missing path where model is empty."""
+        tools = [
+            Tool(name="browser_screenshot", description="", produces_image=True),
+        ]
+        filtered, hidden = filter_tools_for_model(tools, "")
+        assert len(filtered) == 1
+        assert hidden == []
+
+    def test_returned_list_is_a_copy(self):
+        """Caller should be free to mutate the filtered list without affecting input."""
+        tools = [Tool(name="read_file", description="")]
+        filtered, _ = filter_tools_for_model(tools, "gpt-4o")
+        filtered.append(Tool(name="extra", description=""))
+        assert len(tools) == 1
@@ -0,0 +1,65 @@
+"""Tests for vision-only prompt block stripping in Queen nodes.
+
+Covers ``finalize_queen_prompt`` — the function that resolves
+``<!-- vision-only -->...<!-- /vision-only -->`` markers in Queen phase
+prompts before they reach the LLM. Vision-capable models see the inner
+content; text-only models see the block removed entirely.
+"""
+
+from __future__ import annotations
+
+from framework.agents.queen.nodes import finalize_queen_prompt
+
+
+class TestFinalizeQueenPrompt:
+    def test_vision_model_keeps_inner_content_and_strips_markers(self):
+        text = "before <!-- vision-only -->secret<!-- /vision-only --> after"
+        result = finalize_queen_prompt(text, has_vision=True)
+        assert result == "before secret after"
+
+    def test_text_only_model_removes_entire_block(self):
+        text = "before <!-- vision-only -->secret<!-- /vision-only --> after"
+        result = finalize_queen_prompt(text, has_vision=False)
+        assert result == "before  after"
+        assert "secret" not in result
+        assert "vision-only" not in result
+
+    def test_multiline_block_handled(self):
+        """Regex must use DOTALL so blocks can span newlines."""
+        text = (
+            "- item 1\n"
+            "<!-- vision-only -->\n"
+            "- item 2 (vision only)\n"
+            "<!-- /vision-only -->\n"
+            "- item 3\n"
+        )
+        vision = finalize_queen_prompt(text, has_vision=True)
+        text_only = finalize_queen_prompt(text, has_vision=False)
+        assert "- item 2 (vision only)" in vision
+        assert "- item 2 (vision only)" not in text_only
+        assert "- item 1" in text_only and "- item 3" in text_only
+
+    def test_multiple_blocks_in_same_text(self):
+        text = (
+            "A <!-- vision-only -->X<!-- /vision-only --> "
+            "B <!-- vision-only -->Y<!-- /vision-only --> C"
+        )
+        assert finalize_queen_prompt(text, has_vision=True) == "A X B Y C"
+        assert finalize_queen_prompt(text, has_vision=False) == "A  B  C"
+
+    def test_non_greedy_match_does_not_swallow_between_blocks(self):
+        """A naïve greedy regex would match from the first opening marker
+        to the last closing marker and wipe out the middle section. Lock
+        that down so a future refactor can't regress to greedy."""
+        text = (
+            "<!-- vision-only -->first<!-- /vision-only -->"
+            "KEEP"
+            "<!-- vision-only -->second<!-- /vision-only -->"
+        )
+        assert finalize_queen_prompt(text, has_vision=False) == "KEEP"
+        assert finalize_queen_prompt(text, has_vision=True) == "firstKEEPsecond"
+
+    def test_text_without_markers_is_unchanged(self):
+        text = "plain prompt with no markers at all"
+        assert finalize_queen_prompt(text, has_vision=True) == text
+        assert finalize_queen_prompt(text, has_vision=False) == text
@@ -94,7 +94,10 @@ class TestSkillCatalog:
        assert "<name>beta</name>" in prompt
        assert "<description>Alpha skill</description>" in prompt
        assert "<location>/p/alpha/SKILL.md</location>" in prompt
-        assert "<base_dir>/p/alpha</base_dir>" in prompt
+        # <base_dir> is intentionally not emitted — the mandatory header
+        # tells the model to resolve relative paths against the parent of
+        # SKILL.md, so the redundant element was dropped.
+        assert "<base_dir>" not in prompt

    def test_to_prompt_sorted_by_name(self):
        skills = [
@@ -130,13 +133,44 @@ class TestSkillCatalog:
        assert "<name>usr</name>" in prompt
        assert "<name>fw</name>" in prompt

-    def test_to_prompt_contains_behavioral_instruction(self):
+    def test_to_prompt_contains_mandatory_header(self):
+        """The rendered catalog must carry the mandatory pre-reply checklist
+        so soft guidance turns into a required step."""
        catalog = SkillCatalog([_make_skill(source_scope="project")])
        prompt = catalog.to_prompt()

-        assert "When a task matches a skill's description" in prompt
+        assert "## Skills (mandatory)" in prompt
+        assert "Before replying: scan <available_skills>" in prompt
+        assert "never read more than one skill up front" in prompt
+        assert "`read_file`" in prompt
        assert "SKILL.md" in prompt

+    def test_to_prompt_compact_fallback_drops_descriptions(self):
+        """When the full XML body exceeds the char threshold, the compact
+        variant drops <description> but keeps every skill's <name>."""
+        # Each skill contributes ~100+ chars with a long description.
+        # 60 skills easily pushes the body past the threshold.
+        skills = [
+            _make_skill(
+                name=f"skill-{i:03d}",
+                description="A reasonably long description " * 4,
+                location=f"/s/skill-{i:03d}/SKILL.md",
+                base_dir=f"/s/skill-{i:03d}",
+            )
+            for i in range(60)
+        ]
+        catalog = SkillCatalog(skills)
+        prompt = catalog.to_prompt()
+
+        # Mandatory header still present but uses the compact variant wording.
+        assert "## Skills (mandatory)" in prompt
+        assert "scan <available_skills> <name>" in prompt
+        # Every skill's name survives …
+        for i in range(60):
+            assert f"<name>skill-{i:03d}</name>" in prompt
+        # … but no descriptions were rendered.
+        assert "<description>" not in prompt
+
    def test_build_pre_activated_prompt(self):
        skill = _make_skill("research", body="## Deep Research\nDo thorough research.")
        catalog = SkillCatalog([skill])
@@ -1,9 +1,14 @@
 """Tests for AS-6 skill resource loading support.

 Covers:
- <base_dir> element in catalog XML
 - allowlisted_dirs property reflects trusted skill base directories
 - skill_dirs propagation to NodeContext
+
+The catalog XML previously emitted a redundant <base_dir> element next to
+each <location>. That was dropped when the mandatory header took over the
+"resolve relative paths against the parent of SKILL.md" instruction, so
+there is no longer an XML-emission test for base_dir. Programmatic access
+via ``catalog.allowlisted_dirs`` is still covered below.
 """

 from framework.skills.catalog import SkillCatalog
@@ -26,31 +31,6 @@ def _make_skill(


 class TestSkillResourceBaseDir:
-    def test_base_dir_in_xml(self):
-        """Each community skill entry should expose its base_dir in the catalog XML."""
-        skill = _make_skill("deploy", "/project/.hive/skills/deploy")
-        catalog = SkillCatalog([skill])
-        prompt = catalog.to_prompt()
-
-        assert "<base_dir>/project/.hive/skills/deploy</base_dir>" in prompt
-
-    def test_base_dir_xml_escaped(self):
-        """base_dir with XML-special chars should be escaped."""
-        skill = _make_skill("s", "/path/with <&> chars")
-        catalog = SkillCatalog([skill])
-        prompt = catalog.to_prompt()
-
-        assert "<base_dir>/path/with &lt;&amp;&gt; chars</base_dir>" in prompt
-
-    def test_base_dir_present_for_framework_skills(self):
-        """Framework-scope skills now appear in the catalog like any other scope,
-        and their base_dir is included in the XML."""
-        skill = _make_skill("fw", "/hive/_default_skills/fw", source_scope="framework")
-        catalog = SkillCatalog([skill])
-        prompt = catalog.to_prompt()
-        assert "<name>fw</name>" in prompt
-        assert "<base_dir>/hive/_default_skills/fw</base_dir>" in prompt
-
    def test_allowlisted_dirs_matches_skills(self):
        """allowlisted_dirs returns all skill base_dirs including framework ones."""
        skills = [
@@ -799,6 +799,55 @@ def test_resync_returns_false_when_credentials_unchanged(tmp_path, monkeypatch):
    assert registry.resync_mcp_servers_if_needed() is False


+class TestMcpToolProducesImageFlag:
+    """Verify _convert_mcp_tool_to_framework_tool sets produces_image from the name.
+
+    This is the detection step that the filter in AgentLoop depends on —
+    if the regex regresses, text-only models will start seeing screenshot
+    tools they can't use.
+    """
+
+    @staticmethod
+    def _mcp_tool(name: str):
+        return SimpleNamespace(
+            name=name,
+            description=f"{name} description",
+            input_schema={"type": "object", "properties": {}, "required": []},
+            server_name="test",
+        )
+
+    def test_screenshot_flagged(self):
+        registry = ToolRegistry()
+        mcp = self._mcp_tool("browser_screenshot")
+        tool = registry._convert_mcp_tool_to_framework_tool(mcp)  # noqa: SLF001
+        assert tool.produces_image is True
+
+    def test_snapshot_not_flagged(self):
+        """browser_snapshot returns a DOM tree, not an image — must not match."""
+        registry = ToolRegistry()
+        mcp = self._mcp_tool("browser_snapshot")
+        tool = registry._convert_mcp_tool_to_framework_tool(mcp)  # noqa: SLF001
+        assert tool.produces_image is False
+
+    def test_case_insensitive_match(self):
+        registry = ToolRegistry()
+        mcp = self._mcp_tool("TakeScreenshot")
+        tool = registry._convert_mcp_tool_to_framework_tool(mcp)  # noqa: SLF001
+        assert tool.produces_image is True
+
+    def test_plain_tool_not_flagged(self):
+        registry = ToolRegistry()
+        mcp = self._mcp_tool("read_file")
+        tool = registry._convert_mcp_tool_to_framework_tool(mcp)  # noqa: SLF001
+        assert tool.produces_image is False
+
+    def test_image_suffix_variants_flagged(self):
+        registry = ToolRegistry()
+        for name in ("capture_image", "render_image", "get_image", "snapshot_image"):
+            tool = registry._convert_mcp_tool_to_framework_tool(self._mcp_tool(name))  # noqa: SLF001
+            assert tool.produces_image is True, f"{name} should be flagged"
+
+
 # ---------------------------------------------------------------------------
 # Concurrency-safe flag propagation
 # ---------------------------------------------------------------------------
@@ -0,0 +1,176 @@
+# 🐝 Hive Agent v0.10.0: The Colony
+
+> ⚠️ **Breaking change.** This is a large architectural refactor of how agents work in Hive. **Old agents are no longer compatible.** Existing workspaces, custom agents, and saved sessions from pre-v0.10.0 builds will need to be recreated.
+
+---
+
+## ✨ Highlights
+
+The **Colony** introduces a new way of working: a group of specialized workers operating together to run and scale your business.
+
+The role of the **Queen** has evolved. Instead of only orchestrating, the Queen now **executes work first** to deliver immediate value, then **builds systems around that work** to create stable, repeatable business processes.
+
+You now have a full leadership team of eight Queens, each with their own identity, expertise, and voice:
+
+| Queen | Role |
+| --- | --- |
+| **Sophia** | Head of Brand & Design |
+| **Charlotte** | Head of Finance & Fundraising |
+| **Victoria** | Head of Growth |
+| **Eleanor** | Head of Legal |
+| **Rachel** | Head of Operations |
+| **Isabella** | Head of Product Strategy |
+| **Amelia** | Head of Talent |
+| **Alexandra** | Head of Technology |
+
+Start automating your business processes with your Queens today.
+
+---
+
+## 🏛️ The Colony Architecture
+
+### Queens as Identities, Not Just Orchestrators
+
+- **Queen profiles** — each queen is a YAML-backed persona (`~/.hive/agents/queens/{queen_id}/profile.yaml`) with core traits, hidden background, psychological profile, behavior triggers, and skill sets. Profiles are injected into the system prompt at session start.
+- **CEO-style queen selection** — an LLM classifier routes every new user request to the best-matching queen based on the task at hand, with structured routing diagnostics (`QueenSelection`).
+- **Queen DMs** — direct-message pages for each queen with a dedicated session flow, session switcher, and prompt library integration.
+- **Independent / PM mode** — queens run in an independent mode for planning-phase work, with a "think out loud" internal monologue surfaced through internal tags.
+- **Queen memory v2** — simplified memory implementation with reflection agent, cooldown-gated reflections, user identity, doppelganger wiring, and recall-selector for targeted retrieval.
+- **Queen lifecycle tools** — first-class tools for escalation, queen reply, and session handoff.
+
+### Colony Runtime
+
+- **Grand architecture revamp** — the framework, agent loop, runtime, graph, pipeline, executor, and node worker layers have been rewritten from the ground up. Deprecated shims and legacy orchestration paths have been removed.
+- **Colony creation flow** — colonies are created via skill, with reliable event bus subscription, worker spawning, and post-creation list refresh.
+- **Scheduled triggers** — colonies can now be woken on a cron schedule, with triggers firing directly into the owning queen's session.
+- **Simple fork** for agents, stable credential states, and improved worker execution reliability.
+
+---
+
+## 🆕 What's New
+
+### Colony & Queens
+
+- 8 default queen personas (Alexandra, Victoria, Isabella, Charlotte, Eleanor, Sophia, Amelia, Rachel) with profile YAML, examples, and behavior triggers
+- LLM-based queen selector with reasoning output
+- Queen DM page, queen session switcher, and sidebar queen item
+- Queen scope memory, role examples, and identity loading
+- Reflection agent with cooldown and improved reflection runner
+- Queen orchestrator + `routes_queens` API
+- Natural chat replies and cleaner home-prompt bootstrap
+- Queen identity for new sessions
+- `ask_user` / `ask_user_multiple` tools available in queen prompt
+- Escalation and queen-reply tools
+
+### Skills & Tools
+
+- **Learned default skills** — skills the queen has learned become part of her baseline
+- **Tool-gated skill activation** — skills only activate when their required tools are present
+- **Skills for colonies** — per-colony skill registration and loading
+- **Text-only model filter** — image-producing tools and vision-only prompt blocks are hidden from text-only models
+- **Browser skills upgrade** — improved click reliability, screenshot capture, and credential filtering
+- **Deprecated-tool removal** and alignment of Hive tool names across the codebase
+- **Ask-user widget** with fallback rendering and preserved tool pill mapping across turn boundaries for deferred completions
+- **Improved tool-call reliability** across the board (tool limit removed, tool blacklist, tool credential filter)
+- **MCP** — efficient MCP loading at initialization, default MCP bootstrapping, registered available MCP tools, fixed MCP tool initialization and registry pipeline stage
+
+### LLM & Credentials
+
+- **Key pool** for credential management with stable credential states
+- **Aden credentials storage adapter** and subscription-based LLM config activation endpoint
+- **Consolidated model config** with unified model catalog
+- **New providers** — Kimi, Hive, and Aden added to the model catalog
+- **Model switcher** UI with runtime model switching API
+- **LLM key validation endpoint** with agent errors surfaced via SSE
+- **BYOK modal** import fixes for subscription token detection
+
+### Frontend
+
+- **Home redesign** — new home, credentials, and org chart pages
+- **Colony chat** and **queen DM** pages
+- **Sidebar + header** components and global app layout/routing
+- **Model switcher, settings modal, template card**
+- **Prompt library** with search, category filtering, and UI polish
+- **Side panel** fixes and sub-agent pane light-mode support
+- **Flowchart** light-mode support and normalized settings modal sizing
+- **User profile settings** and UI enhancements
+- **Sync user profile** to global memory as `user-profile.md`; queen profile API transformation
+- Removed the old workspace GUI and its dependencies
+
+### Framework & Runtime
+
+- Architecture revamp: new runtime config, simplified agent loading, new infra for queen
+- Home hive directory structure refactor
+- Agent loading pipeline fixes, MCP registry pipeline stage fix
+- Session resume improvements: separate resume vs new-session flow for queen sessions, edge-case fix for message injection in resumed sessions
+- Strip internal tags from user-visible output
+- Colony event bus subscription fixes and shared event bus for parent visibility
+- Worker spawn and stop-worker fixes
+- Default log level and extra logging hooks
+
+---
+
+## 🐛 Bug Fixes
+
+- **Ask-user widget** — fallback when widget fails to mount
+- **Skill loading** for colonies and proper skill resolution across queen sessions
+- **Model switching** and new-chat flow no longer carry stale state
+- **Tool pill mapping** preserved across turn boundary for deferred `ask_user` completions
+- **Tool limit** removed (was capping legitimate long tool lists)
+- **Queen loading** stability fixes
+- **Side panel** rendering issues
+- **Deprecated graphs** removed from UI
+- **Home-page prompts** now reach the queen directly without waiting for the greeting to finish
+- **Colony creation** link, reframing, and post-creation refresh
+- **Build error** in colony creation path
+- **GCU system prompt** tuning
+- **Tool credential filter** correctness
+- **Screenshot** capture and browser click reliability
+- **Queen message injection** when resuming a session
+- **Internal-tag diction** fixes in surfaced output
+- **MCP tool initialization** on cold start
+- **Frontend DM** edge cases
+- **Prompt library** new-session handling for new chat
+- **Config validation** and unavailable Minimax model handling
+- **Queen identity** loading on cold boot
+- **Extra text** in queen selector JSON response parsed safely
+- **Outdated queen communication prompt** removed
+
+---
+
+## 🧹 Refactor & Cleanup
+
+- **Shatter the Eld\*n ring** — top-to-bottom refactor of the runtime core
+- **Grand clean-up** of deprecated code paths
+- **Remove deprecated shims** and old session-status tools
+- **Big test cleanup** — integration tests and component tests rewritten around the new architecture
+- **Update references** for orchestrator / host / loader renames
+- **Consolidate tests** for queen state machine and verified outcomes
+- **Remove old workspace GUI** and its dependencies
+- **Remove old "new agent" button** and deprecated entry points
+- **Home hive directory** structure refactor
+
+---
+
+## ⚠️ Breaking Changes
+
+- **Old agents are not compatible.** Custom agents authored against the pre-v0.10.0 framework will need to be re-authored against the new Queen/Colony runtime.
+- **Session format** — pre-v0.10.0 sessions cannot be resumed.
+- **Deprecated tools removed** and Hive tool names have been realigned; any external scripts referencing old tool names must be updated.
+- **Old session-status tools** removed in favor of the new queen lifecycle tools.
+- **Workspace GUI removed** — the legacy workspace UI is gone; use the new home, colony chat, and queen DM pages.
+- **MCP registry pipeline** — MCP configurations now load through the new registry; custom MCP setups may need to be re-registered.
+
+---
+
+## 🚀 Upgrading
+
+Because this release rewrites the agent runtime, the recommended upgrade path is:
+
+1. Back up `~/.hive/` if you have sessions or custom agents you want to reference.
+2. Pull `main` at the v0.10.0 tag.
+3. Let Hive initialize the new queen profiles under `~/.hive/agents/queens/`.
+4. Re-create any custom agents as colonies/queens against the new framework.
+5. Re-register any custom MCP servers through the new MCP registry.
+
+Welcome to the Colony. 🐝
@@ -255,9 +255,10 @@ def register_tools(mcp: FastMCP) -> None:
            # Clean up whitespace
            text = " ".join(text.split())

-            # Truncate if needed
+            # Truncate if needed (reserve 3 chars for the ellipsis so the
+            # final string stays within max_length)
            if len(text) > max_length:
-                text = text[:max_length] + "..."
+                text = text[: max_length - 3] + "..."

            result: dict[str, Any] = {
                "url": url,
@@ -42,6 +42,39 @@ BRIDGE_PORT = 9229
 # CDP wait_until values
 VALID_WAIT_UNTIL = {"commit", "domcontentloaded", "load", "networkidle"}

+# Fast-fail polling default for element / text waits. 5 seconds is long
+# enough to cover normal SPA render latency on loaded pages, short enough
+# that a bad selector or hallucinated element fails fast instead of
+# burning 30 wall-clock seconds per miss (the old behavior — see the
+# 2026-04-14 gemini-3-flash x.com session where 7 of 14 browser_click
+# calls each hit the 30s deadline for ~210s wasted total).
+#
+# navigate() keeps a longer default (30s) because real page loads can
+# legitimately take that long.
+DEFAULT_WAIT_TIMEOUT_MS: int = 5000
+
+# Longer default for bridge _send calls that wrap genuinely slow ops
+# (full-page screenshot, accessibility tree, navigate). Individual
+# callers can pass their own value via _send(..., timeout=...).
+_LONG_SEND_TIMEOUT_S: float = 60.0
+
+
+async def _adaptive_poll_sleep(elapsed_s: float) -> None:
+    """Sleep between DOM polls with an adaptive backoff.
+
+    Early polls are snappy (50ms) so a quickly-appearing element is
+    reported in ~100ms. Later polls back off (200ms, 500ms) so a
+    missing element doesn't thrash CDP with 300+ querySelector calls
+    before the deadline fires.
+    """
+    if elapsed_s < 1.0:
+        await asyncio.sleep(0.05)
+    elif elapsed_s < 5.0:
+        await asyncio.sleep(0.2)
+    else:
+        await asyncio.sleep(0.5)
+
+
 # Last interaction highlight per tab_id: {x, y, w, h, label, kind}
 # kind: "rect" (element) or "point" (coordinate)
 _interaction_highlights: dict[int, dict] = {}
@@ -296,9 +329,23 @@ class BeelineBridge:
        msg = str(exc).lower()
        return any(m in msg for m in self._CDP_DEAD_SESSION_MARKERS)

-    async def _cdp(self, tab_id: int, method: str, params: dict | None = None) -> dict:
+    async def _cdp(
+        self,
+        tab_id: int,
+        method: str,
+        params: dict | None = None,
+        *,
+        timeout: float | None = None,
+    ) -> dict:
        """Send a CDP command to a tab.

+        ``timeout`` (seconds) overrides the default bridge send timeout.
+        Pass a larger value for genuinely slow operations (full-page
+        screenshots over slow networks, accessibility tree on huge
+        pages) so they don't spuriously fail at the 30s floor. Pass a
+        smaller value for fast probes ("is this element present right
+        now") to fail fast.
+
        On a dead-session error (Chrome detached externally — tab closed,
        DevTools opened, cross-origin nav), evict the stale attach
        cache entry, reattach, and retry once. Without this the Python
@@ -307,7 +354,13 @@ class BeelineBridge:
        """
        start = time.perf_counter()
        try:
-            result = await self._send("cdp", tabId=tab_id, method=method, params=params or {})
+            result = await self._send(
+                "cdp",
+                tabId=tab_id,
+                method=method,
+                params=params or {},
+                timeout=timeout,
+            )
            duration_ms = (time.perf_counter() - start) * 1000
            log_cdp_command(tab_id, method, params, result, duration_ms=duration_ms)
            return result
@@ -327,7 +380,11 @@ class BeelineBridge:
                        self._cdp_attached.add(tab_id)
                        retry_start = time.perf_counter()
                        result = await self._send(
-                            "cdp", tabId=tab_id, method=method, params=params or {}
+                            "cdp",
+                            tabId=tab_id,
+                            method=method,
+                            params=params or {},
+                            timeout=timeout,
                        )
                        log_cdp_command(
                            tab_id,
@@ -594,10 +651,16 @@ class BeelineBridge:
        selector: str,
        button: str = "left",
        click_count: int = 1,
-        timeout_ms: int = 30000,
+        timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
    ) -> dict:
        """Click an element by selector.

+        ``timeout_ms`` controls how long we poll for the element to
+        appear in the DOM. Defaults to :data:`DEFAULT_WAIT_TIMEOUT_MS`
+        (5 s) so a missing or hallucinated selector fails fast. Pass a
+        larger value when the target genuinely needs longer to render
+        (e.g. post-navigation SPA hydration).
+
        Uses multiple fallback methods for robustness:
        1. CDP mouse events with JavaScript bounds
        2. JavaScript click() as fallback
@@ -612,8 +675,12 @@ class BeelineBridge:
        doc = await self._cdp(tab_id, "DOM.getDocument")
        root_id = doc.get("root", {}).get("nodeId")

-        # Wait for element to appear
-        deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
+        # Wait for element to appear. Adaptive polling:
+        # - first 1 s at 50 ms intervals (responsive on fast pages)
+        # - next 4 s at 200 ms
+        # - rest at 500 ms
+        poll_start = asyncio.get_event_loop().time()
+        deadline = poll_start + timeout_ms / 1000
        node_id = None
        while asyncio.get_event_loop().time() < deadline:
            result = await self._cdp(
@@ -622,7 +689,7 @@ class BeelineBridge:
            node_id = result.get("nodeId")
            if node_id:
                break
-            await asyncio.sleep(0.1)
+            await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)

        if not node_id:
            # Check if the element might be inside a Shadow DOM container
@@ -773,7 +840,11 @@ class BeelineBridge:
            )
            await asyncio.sleep(0.05)

-            # Mouse down
+            # Mouse down — if this hangs past the short wait budget we
+            # CANNOT claim success. The prior code swallowed TimeoutError
+            # with `pass` and returned ok=true further down, which is why
+            # the 2026-04-14 gemini session saw 7 clicks land at exactly
+            # 30s with status=ok even though the click had not landed.
            try:
                await asyncio.wait_for(
                    self._cdp(
@@ -787,14 +858,24 @@ class BeelineBridge:
                            "clickCount": click_count,
                        },
                    ),
-                    timeout=1.0,
+                    timeout=2.0,
                )
            except TimeoutError:
-                pass  # Continue even if timeout
+                return {
+                    "ok": False,
+                    "error": (
+                        f"CDP mousePressed timed out for '{selector}' — "
+                        "the click did not land. Consider browser_click_coordinate "
+                        "with an explicit rect from browser_get_rect."
+                    ),
+                }

            await asyncio.sleep(0.08)

-            # Mouse up
+            # Mouse up — same non-silent failure handling. A stuck
+            # mouseReleased means the press is still "held down" in
+            # Chrome's input state; we must surface the failure so the
+            # caller can retry or switch strategy.
            try:
                await asyncio.wait_for(
                    self._cdp(
@@ -811,7 +892,14 @@ class BeelineBridge:
                    timeout=3.0,
                )
            except TimeoutError:
-                pass  # Continue even if timeout
+                return {
+                    "ok": False,
+                    "error": (
+                        f"CDP mouseReleased timed out for '{selector}' — "
+                        "the press event fired but release did not. The page "
+                        "may be in a stuck input state; try browser_click_coordinate."
+                    ),
+                }

            w = bounds_value.get("width", 0)
            h = bounds_value.get("height", 0)
@@ -2174,7 +2262,19 @@ class BeelineBridge:
                        "scale": 1,
                    }

-                result = await self._cdp(tab_id, "Page.captureScreenshot", params)
+                # Pass the outer screenshot timeout budget to the
+                # underlying CDP call. Full-page screenshots over slow
+                # networks can legitimately take 20-40s; the default 30s
+                # _send floor used to make them fail spuriously right at
+                # the boundary. We give the CDP call the full timeout_s
+                # budget so the outer `asyncio.timeout(timeout_s)` is
+                # the only authority on how long we wait.
+                result = await self._cdp(
+                    tab_id,
+                    "Page.captureScreenshot",
+                    params,
+                    timeout=timeout_s,
+                )
                data = result.get("data")

                if not data:
@@ -2249,8 +2349,18 @@ class BeelineBridge:
            logger.error("Screenshot failed: %s", e)
            return {"ok": False, "error": str(e)}

-    async def wait_for_selector(self, tab_id: int, selector: str, timeout_ms: int = 30000) -> dict:
-        """Wait for an element to appear."""
+    async def wait_for_selector(
+        self,
+        tab_id: int,
+        selector: str,
+        timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
+    ) -> dict:
+        """Wait for an element to appear.
+
+        Default 5 s fast-fail. Callers that need to wait longer (e.g.
+        a known slow post-navigation render) should pass an explicit
+        ``timeout_ms``.
+        """
        await self.cdp_attach(tab_id)

        script = f"""
@@ -2259,7 +2369,8 @@ class BeelineBridge:
            }})()
        """

-        deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
+        poll_start = asyncio.get_event_loop().time()
+        deadline = poll_start + timeout_ms / 1000
        while asyncio.get_event_loop().time() < deadline:
            result = await self._cdp(
                tab_id,
@@ -2272,12 +2383,21 @@ class BeelineBridge:
            found = (result or {}).get("result", {}).get("value", False)
            if found:
                return {"ok": True, "selector": selector}
-            await asyncio.sleep(0.1)
+            await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)

        return {"ok": False, "error": f"Element not found within timeout: {selector}"}

-    async def wait_for_text(self, tab_id: int, text: str, timeout_ms: int = 30000) -> dict:
-        """Wait for text to appear on the page."""
+    async def wait_for_text(
+        self,
+        tab_id: int,
+        text: str,
+        timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
+    ) -> dict:
+        """Wait for text to appear on the page.
+
+        Default 5 s fast-fail. Same fast-fail rationale as
+        :meth:`wait_for_selector`.
+        """
        await self.cdp_attach(tab_id)

        script = f"""
@@ -2286,7 +2406,8 @@ class BeelineBridge:
            }})()
        """

-        deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
+        poll_start = asyncio.get_event_loop().time()
+        deadline = poll_start + timeout_ms / 1000
        while asyncio.get_event_loop().time() < deadline:
            result = await self._cdp(
                tab_id,
@@ -2297,7 +2418,7 @@ class BeelineBridge:
            found = (result or {}).get("result", {}).get("value", False)
            if found:
                return {"ok": True, "text": text}
-            await asyncio.sleep(0.1)
+            await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)

        return {"ok": False, "error": f"Text not found within timeout: {text}"}

@@ -28,7 +28,7 @@ def register_advanced_tools(mcp: FastMCP) -> None:
        text: str | None = None,
        tab_id: int | None = None,
        profile: str | None = None,
-        timeout_ms: int = 30000,
+        timeout_ms: int = 5000,
    ) -> dict:
        """
        Wait for a condition.
@@ -39,7 +39,13 @@ def register_advanced_tools(mcp: FastMCP) -> None:
            text: Wait for text to appear on page (optional)
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
-            timeout_ms: Max wait time in ms (default: 30000)
+            timeout_ms: Max wait time in ms for the selector/text poll.
+                Default 5000ms (fast-fail). If the condition isn't met
+                within 5s the call returns {"ok": False, "error": ...}
+                and the agent can try a different approach instead of
+                burning 30s per miss. Pass a larger value (e.g. 15000)
+                only when you genuinely expect the element to take
+                longer than 5s to render.

        Returns:
            Dict with wait result
@@ -6,6 +6,7 @@ All operations go through the Beeline extension via CDP - no Playwright required

 from __future__ import annotations

+import asyncio
 import base64
 import io
 import json
@@ -277,9 +278,20 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            if annotate and target_tab in _interaction_highlights:
                highlights = [_interaction_highlights[target_tab]]

-            # Normalize to 800px wide and annotate
-            data, physical_scale, css_scale = _resize_and_annotate(
-                data, css_width, dpr=dpr, highlights=highlights, width=width
+            # Normalize to 800px wide and annotate. Offloaded to a
+            # thread because PIL Image.open/resize/ImageDraw/composite on
+            # a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
+            # freeze the asyncio event loop and delay every concurrent
+            # tool call during a screenshot. The function is reentrant
+            # (fresh PIL Image per call, no shared state), so to_thread
+            # is safe.
+            data, physical_scale, css_scale = await asyncio.to_thread(
+                _resize_and_annotate,
+                data,
+                css_width,
+                dpr,
+                highlights,
+                width,
            )
            _screenshot_scales[target_tab] = physical_scale
            _screenshot_css_scales[target_tab] = css_scale
@@ -30,7 +30,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
        button: Literal["left", "right", "middle"] = "left",
        double_click: bool = False,
-        timeout_ms: int = 30000,
+        timeout_ms: int = 5000,
    ) -> dict:
        """
        Click an element on the page.
@@ -41,7 +41,13 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")
            button: Mouse button to click (left, right, middle)
            double_click: Perform double-click (default: False)
-            timeout_ms: Timeout waiting for element (default: 30000)
+            timeout_ms: How long to poll for the element to appear in the
+                DOM before giving up. Default 5000ms (fast-fail). A missing
+                or hallucinated selector returns "Element not found" in
+                <=5s so the agent can try a different approach quickly.
+                Pass a larger value (e.g. 15000) ONLY when you know the
+                element will take longer than 5s to render — for example
+                right after a navigation that triggers slow hydration.

        Returns:
            Dict with click result and coordinates
@@ -113,6 +113,24 @@ class TestWebScrapeTool:
        assert isinstance(result, dict)
        assert "error" not in result

+    @pytest.mark.asyncio
+    @patch(_STEALTH_PATH)
+    @patch(_PW_PATH)
+    async def test_truncation_respects_max_length(self, mock_pw, mock_stealth, web_scrape_fn):
+        """Truncated content (including the ellipsis) must not exceed max_length."""
+        # max_length is clamped to >=1000, so build content larger than that
+        long_text = "a" * 5000
+        html = f"<html><body>{long_text}</body></html>"
+        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
+        mock_pw.return_value = mock_cm
+        mock_stealth.return_value.apply_stealth_async = AsyncMock()
+
+        result = await web_scrape_fn(url="https://example.com", max_length=1000)
+        assert "error" not in result
+        assert len(result["content"]) <= 1000
+        assert result["content"].endswith("...")
+        assert result["length"] == len(result["content"])
+
    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
Author	SHA1	Message	Date
Timothy	252710fb41	fix: context health and eviction	2026-04-15 11:40:45 -07:00
Richard Tang	22df99ef51	Merge remote-tracking branch 'origin/main' Release / Create Release (push) Waiting to run Details	2026-04-14 19:56:33 -07:00
Richard Tang	edc3135797	Merge branch 'feature/new-colony'	2026-04-14 19:56:08 -07:00
Richard Tang	27b15789fb	fix: skills prompts	2026-04-14 18:51:14 -07:00
RichardTang-Aden	5ba5933edc	Merge pull request #7046 from vincentjiang777/main docs: new readme	2026-04-14 18:02:49 -07:00
Timothy	50eb4b0e8f	Merge branch 'feature/colony-creation' into feature/new-colony	2026-04-14 16:34:30 -07:00
Richard Tang	3e4a4c9924	Merge remote-tracking branch 'origin/feat/text-only-tool-filter' into feature/new-colony	2026-04-14 16:29:19 -07:00
Richard Tang	c47987e73c	fix: ask user widget fallback	2026-04-14 16:27:12 -07:00
Timothy	256b52b818	fix: skills for colonies	2026-04-14 16:23:17 -07:00
Richard Tang	8f5daf0569	fix: swtiching model and new chat	2026-04-14 16:04:07 -07:00
bryan	af5c72e785	feat: hide image-producing tools and vision-only prompt blocks from text-only models	2026-04-14 12:50:44 -07:00
Timothy	958bafea29	fix: tool gated skill activation	2026-04-14 11:17:03 -07:00
bryan	5cdc01cb8c	fix: preserve tool pill mapping across turn boundary for deferred ask_user completions	2026-04-14 10:56:38 -07:00
Timothy	6979ea825d	fix: remove tool limit	2026-04-14 10:35:08 -07:00
Timothy	d6093a560f	Merge branch 'feature/new-colony' into feature/colony-creation	2026-04-14 10:19:24 -07:00
Hundao	2f58cce781	fix(tools): web_scrape truncation no longer exceeds max_length (#7044 ) The previous code did `text[:max_length] + "..."`, which made the returned content always 3 chars longer than the requested max_length. Reserve room for the ellipsis inside the limit so the contract holds. Fixes #2098	2026-04-14 14:24:42 +08:00
Richard Tang	ab76a66646	fix: queen loading	2026-04-13 22:39:39 -07:00
Richard Tang	c575ff3fe7	feat: queen messages improvements	2026-04-13 22:31:49 -07:00
vincentjiang777	9dc214cfd2	Merge branch 'aden-hive:main' into main	2026-04-10 20:35:42 -07:00