fix: diagnostics

2026-04-19 12:52:04 -07:00
parent ddaafe0307
commit e55cea97ef
5 changed files with 436 additions and 53 deletions
@@ -57,7 +57,9 @@
      "mcp__gcu-tools__browser_type_focused",
      "mcp__gcu-tools__browser_wait",
      "Bash(python3 -c ' *)",
-      "Bash(python3 scripts/debug_queen_prompt.py independent)"
+      "Bash(python3 scripts/debug_queen_prompt.py independent)",
+      "Bash(curl -s --max-time 2 http://127.0.0.1:9230/status)",
+      "Bash(python3 -c \"import json, sys; print\\(json.loads\\(sys.stdin.read\\(\\)\\)['data']['content']\\)\")"
    ],
    "additionalDirectories": [
      "/home/timothy/.hive/skills/writing-hive-skills",
@@ -211,6 +211,34 @@ chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
 chrome.runtime.onInstalled.addListener(ensureOffscreen);
 chrome.runtime.onStartup.addListener(ensureOffscreen);

+// ---------------------------------------------------------------------------
+// CDP event forwarder — diagnostic channel
+// ---------------------------------------------------------------------------
+//
+// chrome.debugger.sendCommand (the cdp handler above) only responds to
+// requests. CDP also emits unsolicited EVENTS (Runtime.consoleAPICalled,
+// Page.frameResized, Target.targetInfoChanged, …) that the bridge doesn't
+// see today. Forward the narrow subset we're currently diagnosing so the
+// Python side can correlate viewport changes with page lifecycle events.
+// Filtered at the source to keep the wire slim.
+// Allow-list of CDP event names that get relayed; all others are dropped.
+const FORWARDED_CDP_EVENTS = new Set([
+  "Runtime.consoleAPICalled",
+  "Page.lifecycleEvent",
+  "Page.frameResized",
+  "Page.frameNavigated",
+  "Target.targetInfoChanged",
+]);
+
+// Relay each allow-listed debugger event to the bridge as a `cdp_event`
+// frame; `params` falls back to {} when Chrome supplies none.
+chrome.debugger.onEvent.addListener((source, method, params) => {
+  if (FORWARDED_CDP_EVENTS.has(method)) {
+    const { tabId } = source;
+    wsSend({ type: "cdp_event", tabId, method, params: params ?? {} });
+  }
+});
+
 // Periodic alarm keeps the service worker from being garbage-collected and
 // recreates the offscreen document if it was evicted.
 chrome.alarms.create("keepAlive", { periodInMinutes: 0.4 });
@@ -166,6 +166,41 @@ _HIT_ELEMENT_JS = """
 """


+# Diagnostic probe: emits '[hive_vp]' console.info samples (kinds: init,
+# resize, visualResize, visibility) carrying viewport / visibility state,
+# which the Runtime.consoleAPICalled CDP channel relays to telemetry.
+# Idempotent via ``window.__hive_vp_instrumented``; sample errors are ignored.
+_HIVE_VP_PROBE_JS = """
+(function () {
+  if (window.__hive_vp_instrumented) return;
+  window.__hive_vp_instrumented = true;
+  function sample(kind) {
+    try {
+      console.info('[hive_vp]', JSON.stringify({
+        kind: kind,
+        innerWidth: window.innerWidth,
+        innerHeight: window.innerHeight,
+        visualW: window.visualViewport && window.visualViewport.width,
+        visualH: window.visualViewport && window.visualViewport.height,
+        docHidden: document.hidden,
+        visibilityState: document.visibilityState,
+        scrollX: window.scrollX,
+        scrollY: window.scrollY,
+        dpr: window.devicePixelRatio,
+        ts: Date.now()
+      }));
+    } catch (e) {}
+  }
+  sample('init');
+  window.addEventListener('resize', function () { sample('resize'); });
+  if (window.visualViewport) {
+    window.visualViewport.addEventListener('resize', function () { sample('visualResize'); });
+  }
+  document.addEventListener('visibilitychange', function () { sample('visibility'); });
+})();
+"""
+
+
 _FOCUSED_ELEMENT_JS = """
 (function() {
    function describe(el) {
@@ -368,6 +403,23 @@ class BeelineBridge:
                    log_connection_event("hello", {"version": msg.get("version")})
                    continue

+                if msg.get("type") == "cdp_event":
+                    # Unsolicited CDP event forwarded by the extension.
+                    # Narrow diagnostic channel — see FORWARDED_CDP_EVENTS
+                    # in browser-extension/background.js. We pick out
+                    # the [hive_vp] console probe as a structured
+                    # viewport_event telemetry entry and also log the
+                    # raw event for correlation with page lifecycle.
+                    try:
+                        self._handle_cdp_event(
+                            msg.get("tabId"),
+                            msg.get("method", ""),
+                            msg.get("params") or {},
+                        )
+                    except Exception:
+                        pass
+                    continue
+
                msg_id = msg.get("id")
                if msg_id and msg_id in self._pending:
                    fut = self._pending.pop(msg_id)
@@ -392,6 +444,84 @@ class BeelineBridge:
                        fut.cancel()
                self._pending.clear()

+    def _handle_cdp_event(self, tab_id: int | None, method: str, params: dict) -> None:
+        """Route one CDP event forwarded by the extension to telemetry.
+
+        Sync and best-effort: the read loop calls this inline, so a bad
+        event must never break it.
+
+        ``Runtime.consoleAPICalled`` whose first argument is one of our
+        marker prefixes is written as a structured entry
+        (``[hive_vp]`` -> ``viewport_event``, ``[hive_attach_canary]``
+        -> ``attach_canary``) with the JSON payload's keys merged in.
+        Any other console call becomes a compact ``cdp_event`` row
+        (first 4 args, 120 chars each). Every other forwarded method
+        is logged under ``cdp_event`` with its params verbatim.
+
+        Args:
+            tab_id: Tab the event came from, or None if not supplied.
+            method: CDP event name, e.g. ``Page.frameResized``.
+            params: CDP event parameters as sent by the extension.
+        """
+        from .telemetry import write_log
+
+        if method != "Runtime.consoleAPICalled":
+            # Page.lifecycleEvent, frameResized, frameNavigated and
+            # Target.targetInfoChanged are rare and high signal, so
+            # the param dict is logged as-is (no truncation).
+            write_log({
+                "type": "cdp_event",
+                "tab_id": tab_id,
+                "method": method,
+                "params": params,
+            })
+            return
+
+        args = params.get("args") or []
+        first = args[0].get("value") if args and isinstance(args[0], dict) else None
+        payload = args[1].get("value") if len(args) >= 2 and isinstance(args[1], dict) else None
+
+        # Marker prefix -> telemetry entry type for our own probes.
+        structured_types = {
+            "[hive_vp]": "viewport_event",
+            "[hive_attach_canary]": "attach_canary",
+        }
+        entry_type = structured_types.get(first) if isinstance(first, str) else None
+        if entry_type is not None and isinstance(payload, str):
+            try:
+                parsed = json.loads(payload)
+            except Exception:
+                parsed = {"_raw": payload}
+            if not isinstance(parsed, dict):
+                # Valid JSON but not an object (list, number, ...): wrap
+                # it so the merge below cannot raise and lose the event.
+                parsed = {"_raw": payload}
+            entry = {"type": entry_type, "tab_id": tab_id}
+            # Page-controlled payload must not overwrite the routing keys.
+            entry.update({k: v for k, v in parsed.items() if k not in entry})
+            write_log(entry)
+            return
+
+        # Everything else: keep a compact row so we can tell whether
+        # ANY console output is flowing through the pipe. Truncate
+        # each arg so a chatty page can't flood the log.
+        compact = []
+        for a in args[:4]:
+            if not isinstance(a, dict):
+                continue
+            v = a.get("value")
+            if isinstance(v, str):
+                compact.append(v[:120])
+            elif v is not None:
+                compact.append(str(v)[:120])
+        write_log({
+            "type": "cdp_event",
+            "tab_id": tab_id,
+            "method": method,
+            "level": params.get("type"),
+            "args": compact,
+        })
+
    # Default wait on a bridge command. Callers with known-slow ops
    # (full-page screenshots on slow networks, AX tree on huge pages)
    # can pass a longer value via _send(..., timeout=...). Using the
@@ -594,12 +724,111 @@ class BeelineBridge:
        """Attach CDP debugger to a tab.

        Returns {"ok": bool}.
+
+        First-attach-per-tab triggers Chrome's "<extension> started
+        debugging this browser" infobar, which shrinks the layout
+        viewport by ~30–70 CSS px. The banner's commit is async from
+        the attach return, so a screenshot taken immediately after
+        can capture the pre-banner layout, leaving the viewport
+        cache stale until the next screenshot or
+        ``_ensure_viewport_size`` call. There is no settle wait here:
+        we prime the viewport cache right away with whatever the page
+        reports, so the first coord-conversion after attach has a seed
+        (possibly pre-banner) that the next refresh then corrects.
        """
        if tab_id in self._cdp_attached:
            return {"ok": True, "attached": False, "message": "Already attached"}
        result = await self._send("cdp.attach", tabId=tab_id)
-        if result.get("ok"):
-            self._cdp_attached.add(tab_id)
+        if not result.get("ok"):
+            return result
+        self._cdp_attached.add(tab_id)
+        # Prime the viewport cache so the first coord-conversion
+        # after attach has a reasonable seed. Also install the
+        # diagnostic viewport-change probe ([hive_vp] console
+        # messages that stream through our CDP-event channel).
+        # Failures are silent — cache will heal on next screenshot
+        # or _ensure_viewport_size call.
+        try:
+            from .tools.inspection import _viewport_sizes
+
+            eval_res = await self._cdp(
+                tab_id,
+                "Runtime.evaluate",
+                {
+                    "expression": "({w: window.innerWidth, h: window.innerHeight})",
+                    "returnByValue": True,
+                },
+            )
+            inner = (eval_res or {}).get("result", {}).get("value") or {}
+            cw = int(float(inner.get("w") or 0))
+            ch = int(float(inner.get("h") or 0))
+            if cw > 0 and ch > 0:
+                _viewport_sizes[tab_id] = (cw, ch)
+        except Exception:
+            pass
+
+        # Runtime must be enabled for consoleAPICalled events to
+        # fire; Page must be enabled for frame* / lifecycle events
+        # to reach the extension. Page.setLifecycleEventsEnabled
+        # is the critical one — without it Chrome withholds the
+        # DOMContentLoaded / load / firstMeaningfulPaint stream.
+        # Each wrapped in try so a failure on one domain doesn't
+        # block the others.
+        try:
+            await self._cdp(tab_id, "Runtime.enable", {})
+        except Exception:
+            pass
+        try:
+            await self._cdp(tab_id, "Page.enable", {})
+        except Exception:
+            pass
+        try:
+            await self._cdp(tab_id, "Page.setLifecycleEventsEnabled", {"enabled": True})
+        except Exception:
+            pass
+
+        # [hive_vp] probe — install resize / visibility listeners on
+        # the page so Chrome tells us when the renderer sees a
+        # viewport change. Uses console.info as a cheap transport
+        # through CDP; filtered server-side by the cdp_event
+        # handler. Idempotent via __hive_vp_instrumented.
+        try:
+            await self._cdp(
+                tab_id,
+                "Runtime.evaluate",
+                {
+                    "expression": _HIVE_VP_PROBE_JS,
+                    "returnByValue": True,
+                    "awaitPromise": False,
+                },
+            )
+        except Exception:
+            pass
+
+        # Canary — emit a recognisable marker from the page so we
+        # can verify end-to-end (page → CDP → extension → bridge →
+        # telemetry) is wired. Should produce one ``attach_canary``
+        # telemetry entry per attach (the bridge splits it out of the
+        # Runtime.consoleAPICalled stream). Zero canary entries after a
+        # run means the extension forwarder is stale and the user
+        # needs to reload the Hive extension in chrome://extensions.
+        try:
+            await self._cdp(
+                tab_id,
+                "Runtime.evaluate",
+                {
+                    "expression": (
+                        "console.info('[hive_attach_canary]', "
+                        "JSON.stringify({tabId: "
+                        + str(tab_id) + ", ts: Date.now()}))"
+                    ),
+                    "returnByValue": True,
+                    "awaitPromise": False,
+                },
+            )
+        except Exception:
+            pass
+
        return result

    async def cdp_detach(self, tab_id: int) -> dict:
@@ -46,16 +46,17 @@ _screenshot_scales: dict[int, float] = {}


 def clear_tab_state(tab_ids) -> None:
-    """Drop cached screenshot scales for the given tab_ids.
+    """Drop cached screenshot scales and viewport sizes for the given tab_ids.

    Called when a tab closes or a profile's context is destroyed so stale
-    scale values can't bleed into a later tab that Chrome happens to assign
+    cache values can't bleed into a later tab that Chrome happens to assign
    the same id. Accepts a single id or any iterable.
    """
    if isinstance(tab_ids, int):
        tab_ids = (tab_ids,)
    for tid in tab_ids:
        _screenshot_scales.pop(tid, None)
+        _viewport_sizes.pop(tid, None)


 def _resize_and_annotate(
@@ -195,34 +196,71 @@ def _resize_and_annotate(
        return data, 1.0


-async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
-    """Return ``(cssWidth, cssHeight)`` for ``tab_id``, populating the
-    cache via ``window.innerWidth`` / ``window.innerHeight`` on miss.
+async def _ensure_viewport_size(tab_id: int, _caller: str = "unknown") -> tuple[int, int]:
+    """Return ``(cssWidth, cssHeight)`` for ``tab_id``, always
+    refreshing from ``window.innerWidth`` / ``window.innerHeight``.

    Used by click / hover / press tools to turn fractional inputs
    (0..1) into CSS px, and by rect tools to turn CSS-px rects into
-    fractions. Degrades to ``(1, 1)`` if the bridge can't be queried
-    — that makes every coord an identity op, which is a safe no-op
-    (and preferable to crashing).
+    fractions.
+
+    Every call emits a ``viewport_sample`` telemetry entry so we
+    can build a timeline of Chrome's reported viewport across an
+    agent run — needed to diagnose the sessions where cssH changes
+    silently (no visible layout shift) between screenshot and
+    click. The entry records the live value, the cached value, and
+    the delta so the transition point is trivial to locate in
+    ``~/.hive/browser-logs/browser-YYYY-MM-DD.jsonl``.
+
+    Falls back to the cached value on evaluate failure, then to
+    ``(1, 1)`` if there's no cache — identity-op is a safe no-op.
    """
-    cached = _viewport_sizes.get(tab_id)
-    if cached is not None and cached[0] > 0 and cached[1] > 0:
-        return cached
    bridge = get_bridge()
+    cw = ch = 0
+    evaluate_error: str | None = None
    try:
        result = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
        inner = (result or {}).get("result") or {}
        cw = int(float(inner.get("w") or 0))
        ch = int(float(inner.get("h") or 0))
-    except Exception:
-        cw, ch = 0, 0
+    except Exception as e:
+        evaluate_error = str(e)
+        cw = ch = 0
+
+    cached_before = _viewport_sizes.get(tab_id)
+
    if cw <= 0 or ch <= 0:
-        # Degraded: bridge didn't return viewport. Cache an identity
-        # so we don't retry on every call; corrects itself after the
-        # next successful browser_screenshot.
-        cw, ch = 1, 1
-    _viewport_sizes[tab_id] = (cw, ch)
-    return cw, ch
+        if cached_before is not None and cached_before[0] > 0 and cached_before[1] > 0:
+            result_cw, result_ch = cached_before
+        else:
+            result_cw, result_ch = 1, 1
+    else:
+        result_cw, result_ch = cw, ch
+        _viewport_sizes[tab_id] = (cw, ch)
+
+    try:
+        from ..telemetry import write_log
+        write_log({
+            "type": "viewport_sample",
+            "tab_id": tab_id,
+            "caller": _caller,
+            "live_w": cw,
+            "live_h": ch,
+            "cached_w": cached_before[0] if cached_before else None,
+            "cached_h": cached_before[1] if cached_before else None,
+            "deltaH_vs_cache": (
+                (ch - cached_before[1])
+                if (cached_before and ch > 0)
+                else None
+            ),
+            "returned_w": result_cw,
+            "returned_h": result_ch,
+            "evaluate_error": evaluate_error,
+        })
+    except Exception:
+        pass
+
+    return result_cw, result_ch


 def register_inspection_tools(mcp: FastMCP) -> None:
@@ -475,7 +513,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        cw, ch = await _ensure_viewport_size(target_tab)
+        cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_shadow_query")
        cw_f = float(cw) if cw > 0 else 1.0
        ch_f = float(ch) if ch > 0 else 1.0
        return {
@@ -538,7 +576,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        cw, ch = await _ensure_viewport_size(target_tab)
+        cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_get_rect")
        cw_f = float(cw) if cw > 0 else 1.0
        ch_f = float(ch) if ch > 0 else 1.0
        return {
@@ -7,11 +7,13 @@ All operations go through the Beeline extension via CDP - no Playwright required
 from __future__ import annotations

 import asyncio
+import json
 import logging
 import time
 from typing import Literal

 from fastmcp import FastMCP
+from mcp.types import ImageContent, TextContent

 from ..bridge import get_bridge
 from ..telemetry import log_tool_call
@@ -28,6 +30,57 @@ _AUTO_SNAPSHOT_SETTLE_S = 0.5
 AutoSnapshotMode = Literal["default", "simple", "interactive", "off"]


+def _text_only(result: dict) -> list:
+    """Return ``result`` JSON-encoded as a one-element MCP content list.
+
+    The coordinate interaction tools are typed to return a list; their
+    early-error paths call this so failures keep that same list shape.
+    """
+    block = TextContent(type="text", text=json.dumps(result))
+    return [block]
+
+
+async def _build_visual_response(result: dict, bridge, target_tab: int | None) -> list:
+    """Return ``result`` as a JSON text block, followed — when possible —
+    by an annotated screenshot taken right after the interaction.
+
+    Shared by the coordinate tools (click / hover / press_at) so the
+    agent sees the post-action page, with the interaction marker when
+    one is recorded for the tab in ``_interaction_highlights``, without
+    issuing a separate ``browser_screenshot`` call.
+
+    Only the text block is returned when the action did not succeed,
+    there is no tab or bridge, the screenshot is not ok, or anything in
+    the screenshot path raises.
+    """
+    response = [TextContent(type="text", text=json.dumps(result))]
+    succeeded = result.get("ok")
+    if succeeded and target_tab is not None and bridge is not None:
+        try:
+            from ..bridge import _interaction_highlights
+            from .inspection import _resize_and_annotate
+
+            capture = await bridge.screenshot(target_tab, full_page=False)
+            if capture.get("ok"):
+                if target_tab in _interaction_highlights:
+                    markers = [_interaction_highlights[target_tab]]
+                else:
+                    markers = None
+                annotated, _scale = await asyncio.to_thread(
+                    _resize_and_annotate,
+                    capture["data"],
+                    capture.get("cssWidth", 0),
+                    capture.get("devicePixelRatio", 1.0),
+                    markers,
+                )
+                image = ImageContent(type="image", data=annotated, mimeType="image/jpeg")
+                response.append(image)
+        except Exception:
+            # Best-effort: a screenshot failure degrades to text-only.
+            pass
+    return response
+
+
 async def _attach_snapshot(result: dict, bridge, target_tab: int, auto_snapshot_mode: str) -> dict:
    """If the interaction succeeded and the caller opted into auto-snapshot,
    wait for the page to settle and attach an accessibility snapshot under
@@ -139,7 +192,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        tab_id: int | None = None,
        profile: str | None = None,
        button: Literal["left", "right", "middle"] = "left",
-    ) -> dict:
+    ) -> list:
        """
        Click at a FRACTION of the viewport (0..1, 0..1).

@@ -155,6 +208,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        tiles, etc.). Proportional positions survive every such
        transform; pixel coords do not.

+        Precision floor: visual coordinate picking from a screenshot
+        is reliable to roughly **3 % of the viewport** (~25–50 CSS px
+        on a 1280×800 window). The y-axis tends to drift more than x
+        because vision models perceive vertical centres less
+        accurately. For targets smaller than that — narrow buttons,
+        checkboxes, dense rows, links — look up the rect with
+        ``browser_get_rect`` (selector-based) or ``browser_shadow_query``
+        (web-component) and pass ``rect.cx`` / ``rect.cy`` directly.
+
+        The response is a 2-block list: a JSON text block with the
+        click result, and a fresh annotated screenshot showing where
+        the click landed (red marker at the dispatched coord). Use
+        the screenshot to verify; if the marker is sitting on the
+        wrong element, retry with the rect-derived centre instead of
+        re-eyeballing.
+
        Args:
            x: X fraction of the viewport (0..1).
            y: Y fraction of the viewport (0..1).
@@ -163,9 +232,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            button: Mouse button to click (left, right, middle)

        Returns:
-            Dict with click result, including ``focused_element``
-            describing what the click focused. ``focused_element.rect``
-            is also in fractions.
+            List with two content blocks: TextContent(JSON of the
+            click result, including ``focused_element`` and its rect
+            in fractions) and ImageContent(annotated post-click
+            screenshot). Falls back to a single-block text-only
+            response on any error.
        """
        start = time.perf_counter()
        params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -174,19 +245,19 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        if not bridge or not bridge.is_connected:
            result = {"ok": False, "error": "Browser extension not connected"}
            log_tool_call("browser_click_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        ctx = _get_context(profile)
        if not ctx:
            result = {"ok": False, "error": "Browser not started. Call browser_start first."}
            log_tool_call("browser_click_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        target_tab = tab_id or ctx.get("activeTabId")
        if target_tab is None:
            result = {"ok": False, "error": "No active tab"}
            log_tool_call("browser_click_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        # Pixel-input guard: legitimate fractions live in [0, 1]. Allow a
        # small overshoot tolerance for edge targets.
@@ -202,12 +273,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                ),
            }
            log_tool_call("browser_click_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        try:
            from .inspection import _ensure_viewport_size

-            cw, ch = await _ensure_viewport_size(target_tab)
+            cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_click_coordinate")
            css_x = x * cw
            css_y = y * ch
            click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
@@ -217,7 +288,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                result={**click_result, "cssWidth": cw, "cssHeight": ch},
                duration_ms=(time.perf_counter() - start) * 1000,
            )
-            return click_result
+            return await _build_visual_response(click_result, bridge, target_tab)
        except Exception as e:
            result = {"ok": False, "error": str(e)}
            log_tool_call(
@@ -226,7 +297,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                error=e,
                duration_ms=(time.perf_counter() - start) * 1000,
            )
-            return result
+            return _text_only(result)

    @mcp.tool()
    async def browser_type(
@@ -558,7 +629,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        y: float,
        tab_id: int | None = None,
        profile: str | None = None,
-    ) -> dict:
+    ) -> list:
        """
        Hover at a FRACTION of the viewport (0..1, 0..1).

@@ -567,6 +638,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        ``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
        the tool converts to CSS px internally.

+        Same precision-floor caveat as ``browser_click_coordinate``:
+        for sub-3 % targets, use rect-derived coords from
+        ``browser_get_rect`` / ``browser_shadow_query``.
+
        Args:
            x: X fraction of the viewport (0..1).
            y: Y fraction of the viewport (0..1).
@@ -574,7 +649,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with hover result
+            List with two content blocks: TextContent(JSON of the
+            hover result) and ImageContent(annotated post-hover
+            screenshot showing the cursor marker). Useful for
+            verifying tooltip / hover-state changes triggered. Falls
+            back to text-only on error.
        """
        start = time.perf_counter()
        params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile}
@@ -583,19 +662,19 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        if not bridge or not bridge.is_connected:
            result = {"ok": False, "error": "Browser extension not connected"}
            log_tool_call("browser_hover_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        ctx = _get_context(profile)
        if not ctx:
            result = {"ok": False, "error": "Browser not started. Call browser_start first."}
            log_tool_call("browser_hover_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        target_tab = tab_id or ctx.get("activeTabId")
        if target_tab is None:
            result = {"ok": False, "error": "No active tab"}
            log_tool_call("browser_hover_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
            result = {
@@ -603,12 +682,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                "error": (f"Coords ({x}, {y}) look like pixels. This tool expects fractions 0..1 of the viewport."),
            }
            log_tool_call("browser_hover_coordinate", params, result=result)
-            return result
+            return _text_only(result)

        try:
            from .inspection import _ensure_viewport_size

-            cw, ch = await _ensure_viewport_size(target_tab)
+            cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_hover_coordinate")
            hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
            log_tool_call(
                "browser_hover_coordinate",
@@ -616,7 +695,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                result=hover_result,
                duration_ms=(time.perf_counter() - start) * 1000,
            )
-            return hover_result
+            return await _build_visual_response(hover_result, bridge, target_tab)
        except Exception as e:
            result = {"ok": False, "error": str(e)}
            log_tool_call(
@@ -625,7 +704,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                error=e,
                duration_ms=(time.perf_counter() - start) * 1000,
            )
-            return result
+            return _text_only(result)

    @mcp.tool()
    async def browser_press_at(
@@ -634,7 +713,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        key: str,
        tab_id: int | None = None,
        profile: str | None = None,
-    ) -> dict:
+    ) -> list:
        """
        Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.

@@ -644,6 +723,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        ``x`` / ``y`` are fractions of the viewport; the tool converts
        to CSS px internally.

+        Same precision-floor caveat as ``browser_click_coordinate``:
+        for sub-3 % targets, use rect-derived coords from
+        ``browser_get_rect`` / ``browser_shadow_query``.
+
        Args:
            x: X fraction of the viewport (0..1).
            y: Y fraction of the viewport (0..1).
@@ -652,7 +735,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with press result
+            List with two content blocks: TextContent(JSON of the
+            press result) and ImageContent(annotated post-press
+            screenshot showing where the key was dispatched). Falls
+            back to text-only on error.
        """
        start = time.perf_counter()
        params = {"x": x, "y": y, "key": key, "tab_id": tab_id, "profile": profile}
@@ -661,19 +747,19 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        if not bridge or not bridge.is_connected:
            result = {"ok": False, "error": "Browser extension not connected"}
            log_tool_call("browser_press_at", params, result=result)
-            return result
+            return _text_only(result)

        ctx = _get_context(profile)
        if not ctx:
            result = {"ok": False, "error": "Browser not started. Call browser_start first."}
            log_tool_call("browser_press_at", params, result=result)
-            return result
+            return _text_only(result)

        target_tab = tab_id or ctx.get("activeTabId")
        if target_tab is None:
            result = {"ok": False, "error": "No active tab"}
            log_tool_call("browser_press_at", params, result=result)
-            return result
+            return _text_only(result)

        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
            result = {
@@ -681,12 +767,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                "error": (f"Coords ({x}, {y}) look like pixels. This tool expects fractions 0..1 of the viewport."),
            }
            log_tool_call("browser_press_at", params, result=result)
-            return result
+            return _text_only(result)

        try:
            from .inspection import _ensure_viewport_size

-            cw, ch = await _ensure_viewport_size(target_tab)
+            cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_press_at")
            press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
            log_tool_call(
                "browser_press_at",
@@ -694,7 +780,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                result=press_result,
                duration_ms=(time.perf_counter() - start) * 1000,
            )
-            return press_result
+            return await _build_visual_response(press_result, bridge, target_tab)
        except Exception as e:
            result = {"ok": False, "error": str(e)}
            log_tool_call(
@@ -703,7 +789,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
                error=e,
                duration_ms=(time.perf_counter() - start) * 1000,
            )
-            return result
+            return _text_only(result)

    @mcp.tool()
    async def browser_select(