fix: context health and eviction

This commit is contained in:
Timothy
2026-04-15 11:40:45 -07:00
parent 22df99ef51
commit 252710fb41
13 changed files with 633 additions and 108 deletions
+6 -1
View File
@@ -39,7 +39,12 @@
"Bash(bun run:*)",
"Bash(npx eslint:*)",
"Bash(npm run:*)",
"Bash(npm test:*)"
"Bash(npm test:*)",
"Bash(grep -n \"PIL\\\\|Image\\\\|to_thread\\\\|run_in_executor\" /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
"WebFetch(domain:docs.litellm.ai)",
"Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
"Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
"Bash(grep -v ':0$')"
],
"additionalDirectories": [
"/home/timothy/.hive/skills/writing-hive-skills",
+48 -3
View File
@@ -3196,7 +3196,9 @@ class AgentLoop(AgentProtocol):
result = _build_tool_error_result(tc, raw)
else:
result = raw
results_by_id[tc.tool_use_id] = self._truncate_tool_result(result, tc.tool_name)
results_by_id[tc.tool_use_id] = await self._truncate_tool_result(
result, tc.tool_name
)
# Phase 3: record results into conversation in original order,
# build logged/real lists, and publish completed events.
@@ -3331,6 +3333,24 @@ class AgentLoop(AgentProtocol):
False,
)
# --- Image eviction: strip old screenshot image_content ---
# Screenshots from browser_screenshot are inlined as base64
# data URLs in message.image_content. Each screenshot costs
# ~250k tokens when the provider counts base64 as text
# (gemini, most non-Anthropic providers). Four screenshots
# in one conversation blew through gemini's 1M context in
# session_20260415_104727_5c4ed7ff and caused garbage
# output ("协日" as the final assistant text). We evict
# aggressively after every tool batch — independent of the
# char-based usage_ratio, which severely underestimates
# image cost (counts each image as ~2000 tokens vs the
# ~250k actually billed). Text metadata stays on the
# evicted messages so the agent can still reason about
# "I took a screenshot at step N".
_max_imgs = self._config.max_retained_screenshots
if _max_imgs >= 0:
await conversation.evict_old_images(keep_latest=_max_imgs)
# --- Mid-turn pruning: prevent context blowup within a single turn ---
if conversation.usage_ratio() >= 0.6:
protect = max(2000, self._config.max_context_tokens // 12)
@@ -3655,7 +3675,7 @@ class AgentLoop(AgentProtocol):
max_chars=max_chars,
)
def _truncate_tool_result(
async def _truncate_tool_result(
self,
result: ToolResult,
tool_name: str,
@@ -3671,8 +3691,33 @@ class AgentLoop(AgentProtocol):
- Large results (> limit): preview + file reference
- Errors: pass through unchanged
- read_file results: truncate with pagination hint (no re-spill)
For large results this does a synchronous JSON round-trip
(``json.loads`` + pretty-print ``json.dumps(indent=2)``) plus a
file write. On big payloads (web_search, web_fetch, full-page
extractions) this can block the event loop for hundreds of ms
per call. We offload to a worker thread so concurrent tool
executions keep running while one large result is being
pretty-printed and spilled to disk.
"""
return truncate_tool_result(
# Fast path: small results don't need thread offload. The
# function only touches disk / does heavy JSON work when the
# result exceeds either the truncation or spillover threshold,
# so cheap pass-throughs stay on the main loop.
needs_offload = (
len(result.content) > 10_000
and not result.is_error
)
if not needs_offload:
return truncate_tool_result(
result=result,
tool_name=tool_name,
max_tool_result_chars=self._config.max_tool_result_chars,
spillover_dir=self._config.spillover_dir,
next_spill_filename_fn=self._next_spill_filename,
)
return await asyncio.to_thread(
truncate_tool_result,
result=result,
tool_name=tool_name,
max_tool_result_chars=self._config.max_tool_result_chars,
+97 -11
View File
@@ -162,10 +162,17 @@ def update_run_cursor(
def _extract_spillover_filename(content: str) -> str | None:
"""Extract spillover filename from a tool result annotation.
Matches patterns produced by EventLoopNode._truncate_tool_result():
- Large result: "saved to 'web_search_1.txt'"
- Small result: "[Saved to 'web_search_1.txt']"
Matches patterns produced by ``truncate_tool_result``:
- New large-result header: "Full result saved at: /abs/path/file.txt"
- Legacy bracketed trailer: "[Saved to 'file.txt']" (pre-2026-04-15,
retained here so cold conversations still resolve)
"""
# New prose format — ``saved at: <absolute path>``, terminated by
# newline or end-of-string.
match = re.search(r"[Ss]aved at:\s*(\S+)", content)
if match:
return match.group(1)
# Legacy format.
match = re.search(r"[Ss]aved to '([^']+)'", content)
return match.group(1) if match else None
@@ -878,12 +885,14 @@ class NodeConversation:
if spillover:
placeholder = (
f"[Pruned tool result: {orig_len} chars. "
f"Full data in '{spillover}'. "
f"Use read_file('{spillover}') to retrieve.]"
f"Pruned tool result ({orig_len:,} chars) cleared from context. "
f"Full data saved at: {spillover}\n"
f"Read the complete data with read_file(path='{spillover}')."
)
else:
placeholder = f"[Pruned tool result: {orig_len} chars cleared from context.]"
placeholder = (
f"Pruned tool result ({orig_len:,} chars) cleared from context."
)
self._messages[i] = Message(
seq=msg.seq,
@@ -905,6 +914,81 @@ class NodeConversation:
self._last_api_input_tokens = None
return count
async def evict_old_images(self, keep_latest: int = 2) -> int:
    """Strip ``image_content`` from older messages, keeping the most recent.

    Screenshots from ``browser_screenshot`` are inlined into the
    message's ``image_content`` as base64 data URLs. Each screenshot
    costs ~250k tokens when the provider counts the base64 as text —
    four screenshots push a conversation over gemini's 1M context
    limit and trigger out-of-context garbage output (see
    ``session_20260415_104727_5c4ed7ff`` for the terminal case where
    the model emitted garbage as its final text then stopped).

    This method walks backward through messages and keeps
    ``image_content`` intact on the most recent ``keep_latest``
    messages that have images. Older messages get their
    ``image_content`` nulled out — the text content (metadata such as
    url, dimensions, scale hints) stays, but the raw bytes are
    dropped. Storage is updated too so cold-restore sees the same
    evicted state.

    Run this right after every tool-result batch so image context
    stays bounded even within a single iteration (the compaction
    pipeline only fires at iteration boundaries — too late for a
    single turn that takes several screenshots).

    Args:
        keep_latest: number of most-recent image-bearing messages to
            leave untouched. Negative values disable eviction.

    Returns:
        The number of messages whose ``image_content`` was evicted.
    """
    # Negative keep_latest means "eviction disabled"; empty history
    # has nothing to do either way.
    if not self._messages or keep_latest < 0:
        return 0
    # Find messages carrying images, walking newest → oldest so the
    # first ``keep_latest`` entries of the list are the ones to keep.
    image_indices: list[int] = []
    for i in range(len(self._messages) - 1, -1, -1):
        if self._messages[i].image_content:
            image_indices.append(i)
    # Nothing to evict if we have ≤ keep_latest images total.
    if len(image_indices) <= keep_latest:
        return 0
    # Evict everything past the first keep_latest (newest) entries.
    to_evict = image_indices[keep_latest:]
    evicted = 0
    for idx in to_evict:
        msg = self._messages[idx]
        # Message is immutable here — rebuild it field-for-field with
        # only image_content cleared.
        self._messages[idx] = Message(
            seq=msg.seq,
            role=msg.role,
            content=msg.content,
            tool_use_id=msg.tool_use_id,
            tool_calls=msg.tool_calls,
            is_error=msg.is_error,
            phase_id=msg.phase_id,
            is_transition_marker=msg.is_transition_marker,
            is_client_input=msg.is_client_input,
            image_content=None,  # ← dropped
            is_skill_content=msg.is_skill_content,
            run_id=msg.run_id,
        )
        evicted += 1
        # Persist the evicted form so a cold restore doesn't bring the
        # image bytes back.
        if self._store:
            await self._store.write_part(
                msg.seq, self._messages[idx].to_storage_dict()
            )
    if evicted:
        # Reset token estimate — image blocks no longer contribute.
        self._last_api_input_tokens = None
        logger.info(
            "evict_old_images: dropped image_content from %d message(s), "
            "kept %d most recent",
            evicted,
            keep_latest,
        )
    return evicted
async def compact(
self,
summary: str,
@@ -1165,16 +1249,18 @@ class NodeConversation:
# Nothing to save — skip file creation
conv_filename = ""
# Build reference message
# Build reference message. Prose format (no brackets) — see the
# poison-pattern note on truncate_tool_result. Frontier models
# autocomplete `[...']` trailers into their own text turns.
ref_parts: list[str] = []
if conv_filename:
full_path = str((spill_path / conv_filename).resolve())
ref_parts.append(
f"[Previous conversation saved to '{full_path}'. "
f"Use read_file('{conv_filename}') to review if needed.]"
f"Previous conversation saved at: {full_path}\n"
f"Read the full transcript with read_file('{conv_filename}')."
)
elif not collapsed_msgs:
ref_parts.append("[Previous freeform messages compacted.]")
ref_parts.append("(Previous freeform messages compacted.)")
# Aggressive: add collapsed tool-call history to the reference
if collapsed_msgs:
@@ -102,12 +102,14 @@ def microcompact(
orig_len = len(msg.content)
if spillover:
placeholder = (
f"[Old tool result cleared: {orig_len} chars. "
f"Full data in '{spillover}'. "
f"Use read_file('{spillover}') to retrieve.]"
f"Old tool result ({orig_len:,} chars) cleared from context. "
f"Full data saved at: {spillover}\n"
f"Read the complete data with read_file(path='{spillover}')."
)
else:
placeholder = f"[Old tool result cleared: {orig_len} chars.]"
placeholder = (
f"Old tool result ({orig_len:,} chars) cleared from context."
)
# Mutate in-place (microcompact is synchronous, no store writes)
conversation._messages[i] = Message(
@@ -142,7 +144,14 @@ def _find_tool_name_for_result(messages: list[Message], tool_msg: Message) -> st
def _extract_spillover_filename_inline(content: str) -> str | None:
"""Quick inline check for spillover filename in tool result content."""
"""Quick inline check for spillover filename in tool result content.
Matches both the new prose format ("saved at: /path") and the
legacy bracketed trailer ("saved to '/path'").
"""
match = re.search(r"saved at:\s*(\S+)", content, re.IGNORECASE)
if match:
return match.group(1)
match = re.search(r"saved to '([^']+)'", content, re.IGNORECASE)
return match.group(1) if match else None
@@ -215,14 +215,30 @@ def truncate_tool_result(
"""Persist tool result to file and optionally truncate for context.
When *spillover_dir* is configured, EVERY non-error tool result is
saved to a file (short filename like ``web_search_1.txt``). A
``[Saved to '...']`` annotation is appended so the reference
survives pruning and compaction.
written to disk for debugging. The LLM-visible content is then
shaped to avoid a **poison pattern** that we traced on 2026-04-15
through a gemini-3.1-pro-preview queen session: the prior format
appended ``\\n\\n[Saved to '/abs/path/file.txt']`` after every
small result, and frontier pattern-matching models (gemini 3.x in
particular) learned to autocomplete the `[Saved to '...']` trailer
in their own assistant turns, eventually degenerating into echoing
the whole tool result instead of deciding what to do next. See
``session_20260415_100751_d49f4c28/conversations/parts/0000000056.json``
for the terminal case where the model's "text" output was the full
tool_result JSON.
- Small results (≤ limit): full content kept + file annotation
- Large results (> limit): preview + file reference
- Errors: pass through unchanged
- read_file results: truncate with pagination hint (no re-spill)
Rules after the fix:
- **Small results (≤ limit):** pass content through unchanged. No
trailer. No annotation. The full content is already in the
message; the disk copy is for debugging only.
- **Large results (> limit):** preview + file reference, but
formatted as plain prose instead of a bracketed ``[...]``
pattern. Structured JSON metadata ("_saved_to") is embedded
inside the JSON body when the preview is JSON-shaped so the
model can locate the full file without seeing a mimicry-prone
bracket token outside the body.
- **Errors:** pass through unchanged.
- **read_file results:** truncate with pagination hint (no re-spill).
"""
limit = max_tool_result_chars
@@ -252,18 +268,20 @@ def truncate_tool_result(
else:
preview_block = result.content[:PREVIEW_CAP] + ""
# Prose header (no brackets).
header = (
f"[{tool_name} result: {len(result.content):,} chars — "
f"too large for context. Use offset_bytes/limit_bytes "
f"parameters to read smaller chunks.]"
f"Tool `{tool_name}` returned {len(result.content):,} characters "
f"(too large for context). Use offset_bytes / limit_bytes "
f"parameters to paginate smaller chunks."
)
if metadata_str:
header += f"\n\nData structure:\n{metadata_str}"
header += (
"\n\nWARNING: This is an INCOMPLETE preview. Do NOT draw conclusions or counts from it."
"\n\nWARNING: the preview below is a SAMPLE only — do NOT "
"draw counts, totals, or conclusions from it."
)
truncated = f"{header}\n\nPreview (small sample only):\n{preview_block}"
truncated = f"{header}\n\nPreview (truncated):\n{preview_block}"
logger.info(
"%s result truncated: %d%d chars (use offset/limit to paginate)",
tool_name,
@@ -301,7 +319,10 @@ def truncate_tool_result(
if limit > 0 and len(result.content) > limit:
# Large result: build a small, metadata-rich preview so the
# LLM cannot mistake it for the complete dataset.
# LLM cannot mistake it for the complete dataset. The
# preview is introduced as plain prose (no bracketed
# ``[Result from …]`` token) so it doesn't prime the model
# to autocomplete the same pattern in its next turn.
PREVIEW_CAP = 5000
# Extract structural metadata (array lengths, key names)
@@ -316,21 +337,22 @@ def truncate_tool_result(
else:
preview_block = result.content[:PREVIEW_CAP] + ""
# Assemble header with structural info + warning
# Prose header (no brackets). Absolute path still surfaced
# so the agent can read the full file, but it's framed as
# a sentence, not a bracketed trailer.
header = (
f"[Result from {tool_name}: {len(result.content):,} chars — "
f"too large for context, saved to '{abs_path}'.]\n"
f"Tool `{tool_name}` returned {len(result.content):,} characters "
f"(too large for context). Full result saved at: {abs_path}\n"
f"Read the complete data with read_file(path='{abs_path}').\n"
)
if metadata_str:
header += f"\nData structure:\n{metadata_str}"
header += f"\nData structure:\n{metadata_str}\n"
header += (
f"\n\nWARNING: The preview below is INCOMPLETE. "
f"Do NOT draw conclusions or counts from it. "
f"Use read_file(path='{abs_path}') to read the "
f"full data before analysis."
"\nWARNING: the preview below is a SAMPLE only — do NOT "
"draw counts, totals, or conclusions from it."
)
content = f"{header}\n\nPreview (small sample only):\n{preview_block}"
content = f"{header}\n\nPreview (truncated):\n{preview_block}"
logger.info(
"Tool result spilled to file: %s (%d chars → %s)",
tool_name,
@@ -338,10 +360,22 @@ def truncate_tool_result(
abs_path,
)
else:
# Small result: keep full content + annotation with absolute path
content = f"{result.content}\n\n[Saved to '{abs_path}']"
# Small result: pass content through UNCHANGED.
#
# The prior design appended `\n\n[Saved to '/abs/path']`
# after every small result so the agent could re-read the
# file later. But (a) the full content is already in the
# message, so there's nothing to re-read; (b) the
# `[Saved to '…']` trailer is a repeating token pattern
# that frontier pattern-matching models autocomplete into
# their own assistant turns, eventually echoing whole tool
# results as "text" instead of making decisions. Dropping
# the trailer entirely kills the poison pattern. Spilled
# files on disk still exist for debugging — they just
# aren't advertised in the LLM-visible message.
content = result.content
logger.info(
"Tool result saved to file: %s (%d chars → %s)",
"Tool result saved to file: %s (%d chars → %s, no trailer)",
tool_name,
len(result.content),
filename,
@@ -373,15 +407,17 @@ def truncate_tool_result(
else:
preview_block = result.content[:PREVIEW_CAP] + ""
# Prose header (no brackets) — see docstring for the poison
# pattern that the bracket format triggered.
header = (
f"[Result from {tool_name}: {len(result.content):,} chars — "
f"truncated to fit context budget.]"
f"Tool `{tool_name}` returned {len(result.content):,} characters "
f"(truncated to fit context budget — no spillover dir configured)."
)
if metadata_str:
header += f"\n\nData structure:\n{metadata_str}"
header += (
"\n\nWARNING: This is an INCOMPLETE preview. "
"Do NOT draw conclusions or counts from the preview alone."
"\n\nWARNING: the preview below is a SAMPLE only — do NOT "
"draw counts, totals, or conclusions from it."
)
truncated = f"{header}\n\n{preview_block}"
+75 -31
View File
@@ -2,6 +2,7 @@
from __future__ import annotations
import asyncio
import json
import logging
import time
@@ -83,6 +84,23 @@ class LoopConfig:
max_tool_result_chars: int = 30_000
spillover_dir: str | None = None
# Image retention in conversation history.
# Screenshots from ``browser_screenshot`` are inlined as base64
# data URLs inside message ``image_content``. Each full-page
# screenshot costs ~250k tokens when the provider counts the
# base64 as text (gemini, most non-Anthropic providers). Four
# screenshots in one conversation push gemini's 1M context over
# the limit and the model starts emitting garbage.
#
# The framework strips image_content from older messages after
# every tool-result batch, keeping only the most recent N
# screenshots. The text metadata on evicted messages (url, size,
# scale hints) is preserved so the agent can still reason about
# "I took a screenshot at step N that showed the compose modal".
# Raise this only if you genuinely need longer visual history AND
# you know your provider is using native image tokenization.
max_retained_screenshots: int = 2
# set_output value spilling.
max_output_value_chars: int = 2_000
@@ -166,7 +184,7 @@ class OutputAccumulator:
async def set(self, key: str, value: Any) -> None:
"""Set a key-value pair, auto-spilling large values to files."""
value = self._auto_spill(key, value)
value = await self._auto_spill(key, value)
self.values[key] = value
if self.store:
cursor = await self.store.read_cursor() or {}
@@ -175,41 +193,67 @@ class OutputAccumulator:
cursor["outputs"] = outputs
await self.store.write_cursor(cursor)
async def _auto_spill(self, key: str, value: Any) -> Any:
    """Save large values to a file and return a reference string.

    Runs the JSON serialization and file write on a worker thread so
    they don't block the asyncio event loop. For a 100k-char dict
    this used to freeze every concurrent tool call for ~50ms of
    ``json.dumps(indent=2)`` plus a sync disk write; for bigger
    payloads or slow storage (NFS, networked FS) the freeze was
    proportionally worse.

    Args:
        key: output key; used to derive the spill filename.
        value: the value being stored. Strings are written verbatim;
            dicts/lists are pretty-printed as JSON.

    Returns:
        The original *value* when it fits within ``max_value_chars``
        (or spilling is disabled), otherwise a prose reference string
        pointing at the spilled file.
    """
    if self.max_value_chars <= 0 or not self.spillover_dir:
        return value
    # Cheap size probe first — if the value is already a short
    # string we can skip both the JSON round-trip and the thread
    # hop entirely.
    if isinstance(value, str) and len(value) <= self.max_value_chars:
        return value

    def _spill_sync() -> Any:
        # JSON serialization for the size check (only for non-strings).
        if isinstance(value, str):
            val_str = value
        else:
            val_str = json.dumps(value, ensure_ascii=False)
        if len(val_str) <= self.max_value_chars:
            return value
        spill_path = Path(self.spillover_dir)
        spill_path.mkdir(parents=True, exist_ok=True)
        ext = ".json" if isinstance(value, (dict, list)) else ".txt"
        filename = f"output_{key}{ext}"
        write_content = (
            json.dumps(value, indent=2, ensure_ascii=False)
            if isinstance(value, (dict, list))
            else str(value)
        )
        file_path = spill_path / filename
        file_path.write_text(write_content, encoding="utf-8")
        file_size = file_path.stat().st_size
        logger.info(
            "set_output value auto-spilled: key=%s, %d chars -> %s (%d bytes)",
            key,
            len(val_str),
            filename,
            file_size,
        )
        # Use absolute path so parent agents can find files from subagents.
        #
        # Prose format (no brackets) — same fix as tool_result_handler:
        # frontier pattern-matching models autocomplete bracketed
        # `[Saved to '...']` trailers into their own assistant turns,
        # eventually degenerating into echoing the file path as text.
        # Keep the path accessible but frame it as plain prose.
        abs_path = str(file_path.resolve())
        return (
            f"Output saved at: {abs_path} ({file_size:,} bytes). "
            f"Read the full data with read_file(path='{abs_path}')."
        )

    # Offload the heavy JSON work + disk write; concurrent tool
    # executions keep running meanwhile.
    return await asyncio.to_thread(_spill_sync)
def get(self, key: str) -> Any | None:
return self.values.get(key)
@@ -344,6 +344,51 @@ Reddit's search input lives **two shadow levels deep** inside `reddit-search-lar
After submitting, press Escape to close the composer.
## File uploads — use `browser_upload`, never click the upload button
**Clicking an `<input type="file">` or the button that triggers one (X's photo button, LinkedIn's attach button, Gmail's paperclip) opens Chrome's native OS file picker. That dialog is rendered by the operating system, NOT the page, so CDP cannot see it, cannot interact with it, and the automation wedges.** This is the single most common way to lock up a browser session on any "compose with media" flow.
**The only correct pattern:** call `browser_upload(selector, file_paths)`. It uses the CDP `DOM.setFileInputFiles` method, which sets the files directly on the input element's internal state as if the user had picked them — no OS dialog ever opens.
```
# WRONG — opens the native file picker, agent gets stuck
browser_click_coordinate(photo_button_x, photo_button_y) # ❌
# RIGHT — sets the file programmatically, no dialog
browser_upload(
selector="input[type='file']", # the underlying file input
file_paths=["/absolute/path/to/image.png"],
)
```
**Finding the file input.** On most modern SPAs the visible "Add photo" / "Attach" button is a styled `<button>` or `<label>`, and the real `<input type="file">` is hidden (often `display:none` or `opacity:0`, positioned offscreen, wrapped in a `<label for="...">`, or injected on click). Use `browser_evaluate` to enumerate ALL file inputs on the page first:
```python
browser_evaluate("""
(function(){
const inputs = Array.from(document.querySelectorAll('input[type="file"]'));
return inputs.map(el => ({
name: el.name || '',
accept: el.accept || '',
multiple: el.multiple,
id: el.id || '',
inViewport: (() => {
const r = el.getBoundingClientRect();
return r.width > 0 && r.height > 0;
})(),
}));
})();
""")
```
Then pass the most specific selector that uniquely identifies the right input (e.g. `input[type='file'][accept*='image']` for a photo-only upload). `browser_upload` doesn't care if the input is hidden or offscreen — `DOM.setFileInputFiles` works on any valid file input node, visible or not.
**X / LinkedIn / Twitter pattern.** On X (`x.com/compose/post`), the photo upload input is `input[data-testid='fileInput']` — hidden, reachable via `browser_upload`. On LinkedIn feed compose, look for `input[type='file'][accept*='image']` inside the post-creation modal after clicking "Add media" (clicking the Add-media button reveals the input but does NOT open the dialog; only clicking the SECOND layer — the "From computer" entry — would trigger the picker. Stop at the first layer, find the input, call `browser_upload`).
**Verification after upload.** `DOM.setFileInputFiles` dispatches a `change` event on the input but NOT the `click` / `focus` events that some sites gate their UI on. Always verify the upload actually took effect by screenshotting the composer (the uploaded image should appear as a preview) or by checking for a "preview" / "remove" element that only exists post-upload. If verification fails, the site may be reading the file via some other bridge — fall back to reading the file bytes and pasting them via the clipboard (`navigator.clipboard.write` with a `ClipboardItem`) through `browser_evaluate`.
**If a native file picker DOES open** (you clicked the wrong thing): there is no recovery via CDP. Press Escape via `browser_press("Escape")` immediately — this dismisses the OS dialog in Chrome on Linux/macOS. Then find the actual `<input type='file'>` and use `browser_upload`.
## Common pitfalls
- **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above.
@@ -354,6 +399,7 @@ After submitting, press Escape to close the composer.
- **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`.
- **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements.
- **Using `browser_type(selector)` on LinkedIn DMs or any shadow-DOM input.** Won't find the element. Fall back to click-to-focus + `browser_press` per character.
- **Clicking a "Photo" / "Attach" / "Upload" button to pick a file.** This opens Chrome's NATIVE OS file picker, which is rendered outside the web page and cannot be interacted with via CDP. Your automation will hang staring at an unreachable dialog. ALWAYS use `browser_upload(selector, file_paths)` against the underlying `<input type='file'>` element — see the "File uploads" section above for the full pattern. This is the single most common way to wedge a browser session on compose-with-media flows (X/LinkedIn/Gmail).
- **Keyboard shortcuts without the `code` field.** Chrome's shortcut dispatcher ignores keyboard events that lack a `code` or `windowsVirtualKeyCode`. `browser_press(..., modifiers=[...])` populates these automatically; raw `Input.dispatchKeyEvent` calls from `browser_evaluate` may not.
- **Taking a screenshot more than 10s after the last interaction** and expecting the highlight to still be visible. The overlay fades after 10s. Take the screenshot sooner, or re-trigger the interaction.
- **Expecting `browser_navigate` to return when you specified `wait_until="networkidle"` on a busy site.** networkidle is approximate — some sites keep a websocket or analytics beacon open forever. Use `"load"` or `"domcontentloaded"` for reliable timing.
@@ -246,6 +246,60 @@ if state['found'] and not state['disabled']:
browser_click("button.share-actions__primary-action")
```
## Posting WITH an image attached
**Do NOT click the "Add media" / image icon inside the feed post composer to pick a file.** LinkedIn renders a styled button that opens Chrome's native OS file picker when clicked, and that dialog is unreachable via CDP — the automation will hang on an invisible modal. Use `browser_upload` directly against the hidden `<input type='file'>`:
```python
# After the post modal is open and the editor has text:
# (A) First, click "Add media" to surface the file input
# (clicking THIS button reveals the input but does NOT itself open
# the OS picker on current LinkedIn — the picker only opens if
# you click the inner "Choose from your device" entry).
media_btn = browser_get_rect("button[aria-label*='image'], button[aria-label*='photo']")
browser_click_coordinate(media_btn.cx, media_btn.cy)
sleep(0.8)
# (B) Enumerate file inputs to find the right one
inputs = browser_evaluate("""
(function(){
return Array.from(document.querySelectorAll('input[type="file"]'))
.map((el, i) => ({
idx: i,
accept: el.accept || '',
name: el.name || '',
}));
})();
""")
# Expect to see one with accept='image/*' or accept containing 'image/jpeg'
# (C) Set the file programmatically — no dialog
browser_upload(
selector="input[type='file'][accept*='image']",
file_paths=["/absolute/path/to/logo.png"],
)
sleep(3) # LinkedIn shows an upload-progress bar + preview
# (D) Verify the image preview rendered before clicking Post
preview_ok = browser_evaluate("""
(function(){
// LinkedIn shows the preview as an <img> inside
// .share-creation-state__image-preview or similar.
return !!document.querySelector(
'.share-creation-state__preview img, .image-preview-container img'
);
})();
""")
if not preview_ok:
raise Exception("LinkedIn image upload did not render — do NOT click Post")
# (E) Now click Post as usual
browser_click("button.share-actions__primary-action")
sleep(4) # media post takes longer to commit than text-only
```
If the image isn't already on disk, write it first with `write_file(absolute_path, bytes)`. `browser_upload` only accepts absolute paths.
## Rate limits and safety
LinkedIn's abuse detection is aggressive. Respect these limits:
@@ -79,6 +79,61 @@ if state['found'] and not state['disabled']:
browser_press("Escape") # close any leftover modal
```
## Posting a tweet WITH an image
**Critical: NEVER click the photo button.** On `x.com/compose/post` the media button is a styled `<button>` that triggers Chrome's native OS file picker when clicked — that dialog is unreachable via CDP and will wedge the automation. Instead, set the file directly on the hidden `<input type='file'>` element using `browser_upload`:
```python
# 1. Open the compose modal as usual
browser_press("n")
sleep(1.5)
browser_click_coordinate(ta_rect.cx, ta_rect.cy)
sleep(0.5)
browser_type("[data-testid='tweetTextarea_0']", tweet_text)
# 2. Find the hidden file input X uses for media uploads.
# X's input is marked with data-testid='fileInput' and accepts
# image/*,video/*. It's hidden (display:none) but still mounted.
inputs = browser_evaluate("""
(function(){
return Array.from(document.querySelectorAll('input[type="file"]'))
.map(el => ({
testid: el.getAttribute('data-testid') || '',
accept: el.accept || '',
multiple: el.multiple,
}));
})();
""")
# Expect to see: [{testid: 'fileInput', accept: 'image/jpeg,...', multiple: true}]
# 3. Set the file WITHOUT opening any dialog
browser_upload(
selector="input[data-testid='fileInput']",
file_paths=["/absolute/path/to/photo.png"],
)
sleep(2) # X takes ~1-2s to show the preview thumbnail
# 4. Verify the preview rendered before posting — if not, the upload
# didn't land and Post button will fail.
preview = browser_evaluate("""
(function(){
// X renders uploaded media as an <img> with data-testid='attachments'
// (or similar) inside the composer.
const att = document.querySelector('[data-testid="attachments"] img');
return { hasPreview: !!att };
})();
""")
if not preview['hasPreview']:
raise Exception("Upload didn't render in composer — do NOT click Post")
# 5. Now click Post as usual
browser_click("[data-testid='tweetButton']")
sleep(3) # media upload + post takes longer than text-only
browser_press("Escape")
```
If you don't already have the image file on disk, write it first: `write_file("/tmp/x_upload.png", base64_bytes)` or copy from a known location. `browser_upload` requires an absolute file path — relative paths and `~` expansion are not supported.
## Reply to a post flow
The reply flow is the same shape as posting, with a few scroll / find-and-click steps before.
+142 -21
View File
@@ -42,6 +42,39 @@ BRIDGE_PORT = 9229
# CDP wait_until values
VALID_WAIT_UNTIL = {"commit", "domcontentloaded", "load", "networkidle"}
# Fast-fail polling default for element / text waits. 5 seconds is long
# enough to cover normal SPA render latency on loaded pages, short enough
# that a bad selector or hallucinated element fails fast instead of
# burning 30 wall-clock seconds per miss (the old behavior — see the
# 2026-04-14 gemini-3-flash x.com session where 7 of 14 browser_click
# calls each hit the 30s deadline for ~210s wasted total).
#
# navigate() keeps a longer default (30s) because real page loads can
# legitimately take that long.
DEFAULT_WAIT_TIMEOUT_MS: int = 5000
# Longer default for bridge _send calls that wrap genuinely slow ops
# (full-page screenshot, accessibility tree, navigate). Individual
# callers can pass their own value via _send(..., timeout=...).
_LONG_SEND_TIMEOUT_S: float = 60.0
async def _adaptive_poll_sleep(elapsed_s: float) -> None:
"""Sleep between DOM polls with an adaptive backoff.
Early polls are snappy (50ms) so a quickly-appearing element is
reported in ~100ms. Later polls back off (200ms, 500ms) so a
missing element doesn't thrash CDP with 300+ querySelector calls
before the deadline fires.
"""
if elapsed_s < 1.0:
await asyncio.sleep(0.05)
elif elapsed_s < 5.0:
await asyncio.sleep(0.2)
else:
await asyncio.sleep(0.5)
# Last interaction highlight per tab_id: {x, y, w, h, label, kind}
# kind: "rect" (element) or "point" (coordinate)
_interaction_highlights: dict[int, dict] = {}
@@ -296,9 +329,23 @@ class BeelineBridge:
msg = str(exc).lower()
return any(m in msg for m in self._CDP_DEAD_SESSION_MARKERS)
async def _cdp(self, tab_id: int, method: str, params: dict | None = None) -> dict:
async def _cdp(
self,
tab_id: int,
method: str,
params: dict | None = None,
*,
timeout: float | None = None,
) -> dict:
"""Send a CDP command to a tab.
``timeout`` (seconds) overrides the default bridge send timeout.
Pass a larger value for genuinely slow operations (full-page
screenshots over slow networks, accessibility tree on huge
pages) so they don't spuriously fail at the 30s floor. Pass a
smaller value for fast probes ("is this element present right
now") to fail fast.
On a dead-session error (Chrome detached externally, tab closed,
DevTools opened, cross-origin nav), evict the stale attach
cache entry, reattach, and retry once. Without this the Python
@@ -307,7 +354,13 @@ class BeelineBridge:
"""
start = time.perf_counter()
try:
result = await self._send("cdp", tabId=tab_id, method=method, params=params or {})
result = await self._send(
"cdp",
tabId=tab_id,
method=method,
params=params or {},
timeout=timeout,
)
duration_ms = (time.perf_counter() - start) * 1000
log_cdp_command(tab_id, method, params, result, duration_ms=duration_ms)
return result
@@ -327,7 +380,11 @@ class BeelineBridge:
self._cdp_attached.add(tab_id)
retry_start = time.perf_counter()
result = await self._send(
"cdp", tabId=tab_id, method=method, params=params or {}
"cdp",
tabId=tab_id,
method=method,
params=params or {},
timeout=timeout,
)
log_cdp_command(
tab_id,
@@ -594,10 +651,16 @@ class BeelineBridge:
selector: str,
button: str = "left",
click_count: int = 1,
timeout_ms: int = 30000,
timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
) -> dict:
"""Click an element by selector.
``timeout_ms`` controls how long we poll for the element to
appear in the DOM. Defaults to :data:`DEFAULT_WAIT_TIMEOUT_MS`
(5 s) so a missing or hallucinated selector fails fast. Pass a
larger value when the target genuinely needs longer to render
(e.g. post-navigation SPA hydration).
Uses multiple fallback methods for robustness:
1. CDP mouse events with JavaScript bounds
2. JavaScript click() as fallback
@@ -612,8 +675,12 @@ class BeelineBridge:
doc = await self._cdp(tab_id, "DOM.getDocument")
root_id = doc.get("root", {}).get("nodeId")
# Wait for element to appear
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
# Wait for element to appear. Adaptive polling:
# - first 1 s at 50 ms intervals (responsive on fast pages)
# - next 4 s at 200 ms
# - rest at 500 ms
poll_start = asyncio.get_event_loop().time()
deadline = poll_start + timeout_ms / 1000
node_id = None
while asyncio.get_event_loop().time() < deadline:
result = await self._cdp(
@@ -622,7 +689,7 @@ class BeelineBridge:
node_id = result.get("nodeId")
if node_id:
break
await asyncio.sleep(0.1)
await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)
if not node_id:
# Check if the element might be inside a Shadow DOM container
@@ -773,7 +840,11 @@ class BeelineBridge:
)
await asyncio.sleep(0.05)
# Mouse down
# Mouse down — if this hangs past the short wait budget we
# CANNOT claim success. The prior code swallowed TimeoutError
# with `pass` and returned ok=true further down, which is why
# the 2026-04-14 gemini session saw 7 clicks land at exactly
# 30s with status=ok even though the click had not landed.
try:
await asyncio.wait_for(
self._cdp(
@@ -787,14 +858,24 @@ class BeelineBridge:
"clickCount": click_count,
},
),
timeout=1.0,
timeout=2.0,
)
except TimeoutError:
pass # Continue even if timeout
return {
"ok": False,
"error": (
f"CDP mousePressed timed out for '{selector}' — "
"the click did not land. Consider browser_click_coordinate "
"with an explicit rect from browser_get_rect."
),
}
await asyncio.sleep(0.08)
# Mouse up
# Mouse up — same non-silent failure handling. A stuck
# mouseReleased means the press is still "held down" in
# Chrome's input state; we must surface the failure so the
# caller can retry or switch strategy.
try:
await asyncio.wait_for(
self._cdp(
@@ -811,7 +892,14 @@ class BeelineBridge:
timeout=3.0,
)
except TimeoutError:
pass # Continue even if timeout
return {
"ok": False,
"error": (
f"CDP mouseReleased timed out for '{selector}' — "
"the press event fired but release did not. The page "
"may be in a stuck input state; try browser_click_coordinate."
),
}
w = bounds_value.get("width", 0)
h = bounds_value.get("height", 0)
@@ -2174,7 +2262,19 @@ class BeelineBridge:
"scale": 1,
}
result = await self._cdp(tab_id, "Page.captureScreenshot", params)
# Pass the outer screenshot timeout budget to the
# underlying CDP call. Full-page screenshots over slow
# networks can legitimately take 20-40s; the default 30s
# _send floor used to make them fail spuriously right at
# the boundary. We give the CDP call the full timeout_s
# budget so the outer `asyncio.timeout(timeout_s)` is
# the only authority on how long we wait.
result = await self._cdp(
tab_id,
"Page.captureScreenshot",
params,
timeout=timeout_s,
)
data = result.get("data")
if not data:
@@ -2249,8 +2349,18 @@ class BeelineBridge:
logger.error("Screenshot failed: %s", e)
return {"ok": False, "error": str(e)}
async def wait_for_selector(self, tab_id: int, selector: str, timeout_ms: int = 30000) -> dict:
"""Wait for an element to appear."""
async def wait_for_selector(
self,
tab_id: int,
selector: str,
timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
) -> dict:
"""Wait for an element to appear.
Default 5 s fast-fail. Callers that need to wait longer (e.g.
a known slow post-navigation render) should pass an explicit
``timeout_ms``.
"""
await self.cdp_attach(tab_id)
script = f"""
@@ -2259,7 +2369,8 @@ class BeelineBridge:
}})()
"""
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
poll_start = asyncio.get_event_loop().time()
deadline = poll_start + timeout_ms / 1000
while asyncio.get_event_loop().time() < deadline:
result = await self._cdp(
tab_id,
@@ -2272,12 +2383,21 @@ class BeelineBridge:
found = (result or {}).get("result", {}).get("value", False)
if found:
return {"ok": True, "selector": selector}
await asyncio.sleep(0.1)
await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)
return {"ok": False, "error": f"Element not found within timeout: {selector}"}
async def wait_for_text(self, tab_id: int, text: str, timeout_ms: int = 30000) -> dict:
"""Wait for text to appear on the page."""
async def wait_for_text(
self,
tab_id: int,
text: str,
timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
) -> dict:
"""Wait for text to appear on the page.
Default 5 s fast-fail. Same fast-fail rationale as
:meth:`wait_for_selector`.
"""
await self.cdp_attach(tab_id)
script = f"""
@@ -2286,7 +2406,8 @@ class BeelineBridge:
}})()
"""
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
poll_start = asyncio.get_event_loop().time()
deadline = poll_start + timeout_ms / 1000
while asyncio.get_event_loop().time() < deadline:
result = await self._cdp(
tab_id,
@@ -2297,7 +2418,7 @@ class BeelineBridge:
found = (result or {}).get("result", {}).get("value", False)
if found:
return {"ok": True, "text": text}
await asyncio.sleep(0.1)
await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)
return {"ok": False, "error": f"Text not found within timeout: {text}"}
+8 -2
View File
@@ -28,7 +28,7 @@ def register_advanced_tools(mcp: FastMCP) -> None:
text: str | None = None,
tab_id: int | None = None,
profile: str | None = None,
timeout_ms: int = 30000,
timeout_ms: int = 5000,
) -> dict:
"""
Wait for a condition.
@@ -39,7 +39,13 @@ def register_advanced_tools(mcp: FastMCP) -> None:
text: Wait for text to appear on page (optional)
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
timeout_ms: Max wait time in ms (default: 30000)
timeout_ms: Max wait time in ms for the selector/text poll.
Default 5000ms (fast-fail). If the condition isn't met
within 5s the call returns {"ok": False, "error": ...}
and the agent can try a different approach instead of
burning 30s per miss. Pass a larger value (e.g. 15000)
only when you genuinely expect the element to take
longer than 5s to render.
Returns:
Dict with wait result
+15 -3
View File
@@ -6,6 +6,7 @@ All operations go through the Beeline extension via CDP - no Playwright required
from __future__ import annotations
import asyncio
import base64
import io
import json
@@ -277,9 +278,20 @@ def register_inspection_tools(mcp: FastMCP) -> None:
if annotate and target_tab in _interaction_highlights:
highlights = [_interaction_highlights[target_tab]]
# Normalize to 800px wide and annotate
data, physical_scale, css_scale = _resize_and_annotate(
data, css_width, dpr=dpr, highlights=highlights, width=width
# Normalize to 800px wide and annotate. Offloaded to a
# thread because PIL Image.open/resize/ImageDraw/composite on
# a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
# freeze the asyncio event loop and delay every concurrent
# tool call during a screenshot. The function is reentrant
# (fresh PIL Image per call, no shared state), so to_thread
# is safe.
data, physical_scale, css_scale = await asyncio.to_thread(
_resize_and_annotate,
data,
css_width,
dpr,
highlights,
width,
)
_screenshot_scales[target_tab] = physical_scale
_screenshot_css_scales[target_tab] = css_scale
+8 -2
View File
@@ -30,7 +30,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: str | None = None,
button: Literal["left", "right", "middle"] = "left",
double_click: bool = False,
timeout_ms: int = 30000,
timeout_ms: int = 5000,
) -> dict:
"""
Click an element on the page.
@@ -41,7 +41,13 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: Browser profile name (default: "default")
button: Mouse button to click (left, right, middle)
double_click: Perform double-click (default: False)
timeout_ms: Timeout waiting for element (default: 30000)
timeout_ms: How long to poll for the element to appear in the
DOM before giving up. Default 5000ms (fast-fail). A missing
or hallucinated selector returns "Element not found" in
<=5s so the agent can try a different approach quickly.
Pass a larger value (e.g. 15000) ONLY when you know the
element will take longer than 5s to render — for example,
right after a navigation that triggers slow hydration.
Returns:
Dict with click result and coordinates