feat: fraction-based visual clicks

2026-04-16 22:36:41 -07:00
parent aba0ff07ba
commit 558813e7fa
7 changed files with 259 additions and 181 deletions
@@ -962,9 +962,9 @@ class BeelineBridge:
        """Read document.activeElement and return a compact descriptor.

        The JS returns ``rect`` fields in CSS px (they come straight
-        from ``getBoundingClientRect``). We scale them to screenshot
-        pixels here so the agent sees a rect in the same coord space
-        it passed to click / hover / press_at.
+        from ``getBoundingClientRect``). We convert them to fractions
+        of the viewport here so the agent sees a rect in the same
+        coord space it passed to click / hover / press_at.

        Returns None on any failure — never raises.
        """
@@ -973,20 +973,23 @@ class BeelineBridge:
            result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
            info = (result or {}).get("result")
            if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
-                # Convert CSS px rect → screenshot px using the cached
-                # scale. Fall back to 1.0 if no screenshot has been
-                # taken yet on this tab.
-                from .tools.inspection import _screenshot_css_scales
+                from .tools.inspection import _viewport_sizes

-                scale = _screenshot_css_scales.get(tab_id, 1.0) or 1.0
-                if scale > 0 and scale != 1.0:
+                vp = _viewport_sizes.get(tab_id)
+                if vp and vp[0] > 0 and vp[1] > 0:
+                    cw, ch = float(vp[0]), float(vp[1])
                    r = info["rect"]
                    info["rect"] = {
-                        "x": round(r.get("x", 0) / scale, 1),
-                        "y": round(r.get("y", 0) / scale, 1),
-                        "width": round(r.get("width", 0) / scale, 1),
-                        "height": round(r.get("height", 0) / scale, 1),
+                        "x": round(r.get("x", 0) / cw, 4),
+                        "y": round(r.get("y", 0) / ch, 4),
+                        "width": round(r.get("width", 0) / cw, 4),
+                        "height": round(r.get("height", 0) / ch, 4),
                    }
+                else:
+                    # Degraded: cache missing (no screenshot taken
+                    # yet). Leave rect in CSS px and flag it so the
+                    # agent can tell.
+                    info["rectSpace"] = "css"
            return info
        except Exception:
            return None
@@ -256,12 +256,13 @@ def register_advanced_tools(mcp: FastMCP) -> None:
        try:
            result = await bridge.resize(target_tab, width, height)
            # Invalidate per-tab scale caches — CSS width changed, so the
-            # cached image→CSS multiplier is stale. Click / rect tools
-            # will re-query innerWidth on next use via _ensure_css_scale.
+            # cached viewport dimensions are stale. Click / rect tools
+            # will re-query innerWidth / innerHeight on next use via
+            # _ensure_viewport_size.
            try:
-                from .inspection import _screenshot_css_scales, _screenshot_scales
+                from .inspection import _screenshot_scales, _viewport_sizes

-                _screenshot_css_scales.pop(target_tab, None)
+                _viewport_sizes.pop(target_tab, None)
                _screenshot_scales.pop(target_tab, None)
            except Exception:
                pass
@@ -24,21 +24,25 @@ from .tabs import _get_context
 logger = logging.getLogger(__name__)


-# Fixed output width for all screenshots. Chosen well below Anthropic's
-# ~1568-px vision-API resize threshold so the image the server emits is
-# the SAME image (pixel-for-pixel) the LLM sees. That preserves
-# image_px == model_px, which is the cornerstone of the "LLM works in
-# screenshot pixels only" contract — all click/hover/press/rect tools
-# translate between image pixels and CSS pixels internally.
+# Fixed output width for all screenshots (bandwidth default). This
+# number does NOT affect coordinate semantics — click / hover / press
+# and rect tools all work in fractions of the viewport (0..1), which
+# are invariant to whatever resize / tile the vision API applies. The
+# 800 px width is simply small enough to keep JPEG payloads under
+# ~150 KB on typical UI screenshots.
 _SCREENSHOT_WIDTH = 800

-# Per-tab scale caches populated on every browser_screenshot and on
-# lazy-init inside the click tools. Both are ``image_px × scale =
-# target_px`` multipliers.
-# - _screenshot_scales[tab]      → physical scale (image → physical px, debug only)
-# - _screenshot_css_scales[tab]  → css scale      (image → CSS px, used for Input events)
+# Per-tab viewport-size cache populated on every browser_screenshot
+# and on lazy-init inside the click tools. Stores CSS-pixel viewport
+# dimensions (window.innerWidth / window.innerHeight). Click tools
+# multiply fractional inputs by these to get CSS coords before
+# dispatching CDP events; rect tools divide CSS-pixel DOM rects by
+# these to produce fractions for the agent.
+_viewport_sizes: dict[int, tuple[int, int]] = {}
+
+# Optional debug cache — physical-px scale per tab (orig_png_w /
+# _SCREENSHOT_WIDTH). Logged only; no consumer.
 _screenshot_scales: dict[int, float] = {}
-_screenshot_css_scales: dict[int, float] = {}


 def _resize_and_annotate(
@@ -46,27 +50,24 @@ def _resize_and_annotate(
    css_width: int,
    dpr: float = 1.0,
    highlights: list[dict] | None = None,
-) -> tuple[str, float, float]:
+) -> tuple[str, float]:
    """Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
    and re-encode as JPEG quality 75.

-    CDP captures at the physical-pixel resolution (DPR × CSS). We
-    downscale to 800 px wide so the delivered image stays under
-    Anthropic's vision-API resize cap — the model sees pixel-for-pixel
-    what we send.
+    The image dimensions do NOT determine click coordinates any more —
+    the tools work in viewport fractions. This helper exists purely
+    for bandwidth + annotation overlay. Returns ``(new_b64,
+    physical_scale)`` where ``physical_scale = orig_png_w / output_w``
+    is kept for debug logging.

-    Returns ``(new_b64, physical_scale, css_scale)`` where
-    - ``physical_scale = orig_png_w / _SCREENSHOT_WIDTH`` (image → physical px)
-    - ``css_scale      = css_width / _SCREENSHOT_WIDTH`` (image → CSS px)
-
-    Highlight rects arrive in CSS px and are divided by ``css_scale``
-    before drawing so overlays land in the correct spot on the
-    800-wide output.
+    Highlight rects arrive in CSS px; they're converted to image-space
+    for overlay drawing via the local ``css_to_image = css_width /
+    output_w`` factor (computed inline — no external cache).
    """
    if not css_width or css_width <= 0:
        # Bridge always supplies css_width from window.innerWidth; only
        # reach here on a degraded response. Return the raw PNG.
-        return data, 1.0, 1.0
+        return data, 1.0

    try:
        from PIL import Image, ImageDraw, ImageFont
@@ -78,17 +79,15 @@ def _resize_and_annotate(

            orig_w = struct.unpack(">I", raw[16:20])[0]
        physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
-        css_scale = css_width / _SCREENSHOT_WIDTH
        logger.warning(
            "PIL not available — screenshot resize SKIPPED. "
            "Returning raw physical-px PNG. physicalScale=%.4f, "
-            "cssScale=%.4f, css_width=%d, dpr=%s. Install Pillow for correct clicks.",
+            "css_width=%d, dpr=%s. Install Pillow for annotation.",
            physical_scale,
-            css_scale,
            css_width,
            dpr,
        )
-        return data, round(physical_scale, 4), round(css_scale, 4)
+        return data, round(physical_scale, 4)

    try:
        raw = base64.b64decode(data)
@@ -96,14 +95,17 @@ def _resize_and_annotate(
        orig_w, orig_h = img.size

        physical_scale = orig_w / _SCREENSHOT_WIDTH
-        css_scale = css_width / _SCREENSHOT_WIDTH
        new_w = _SCREENSHOT_WIDTH
        new_h = round(orig_h * new_w / orig_w)
        if (new_w, new_h) != img.size:
            img = img.resize((new_w, new_h), Image.LANCZOS)

+        # Local CSS → image px factor for overlay draws. Kept local —
+        # not exported, not stored, not leaked to the agent.
+        css_to_image = css_width / _SCREENSHOT_WIDTH
+
        logger.info(
-            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, cssScale=%.4f",
+            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, css_to_image=%.4f",
            orig_w,
            orig_h,
            new_w,
@@ -111,7 +113,7 @@ def _resize_and_annotate(
            css_width,
            dpr,
            physical_scale,
-            css_scale,
+            css_to_image,
        )

        if highlights:
@@ -126,10 +128,10 @@ def _resize_and_annotate(
                kind = h.get("kind", "rect")
                label = h.get("label", "")
                # Highlights arrive in CSS px → convert to image px.
-                ix = h["x"] / css_scale
-                iy = h["y"] / css_scale
-                iw = h.get("w", 0) / css_scale
-                ih = h.get("h", 0) / css_scale
+                ix = h["x"] / css_to_image
+                iy = h["y"] / css_to_image
+                iw = h.get("w", 0) / css_to_image
+                ih = h.get("h", 0) / css_to_image

                if kind == "point":
                    cx, cy, r = ix, iy, 10
@@ -169,7 +171,6 @@ def _resize_and_annotate(
        return (
            base64.b64encode(buf.getvalue()).decode(),
            round(physical_scale, 4),
-            round(css_scale, 4),
        )
    except Exception:
        logger.warning(
@@ -179,30 +180,37 @@ def _resize_and_annotate(
            dpr,
            exc_info=True,
        )
-        return data, 1.0, 1.0
+        return data, 1.0


-async def _ensure_css_scale(tab_id: int) -> float:
-    """Return the image→CSS scale for ``tab_id``, populating the cache
-    via ``window.innerWidth`` if missing. Used by click tools when the
-    agent clicks before the first screenshot has been taken.
+async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
+    """Return ``(cssWidth, cssHeight)`` for ``tab_id``, populating the
+    cache via ``window.innerWidth`` / ``window.innerHeight`` on miss.
+
+    Used by click / hover / press tools to turn fractional inputs
+    (0..1) into CSS px, and by rect tools to turn CSS-px rects into
+    fractions. Degrades to ``(1, 1)`` if the bridge can't be queried
+    — that makes every coord an identity op, which is a safe no-op
+    (and preferable to crashing).
    """
-    cached = _screenshot_css_scales.get(tab_id)
-    if cached is not None and cached > 0:
+    cached = _viewport_sizes.get(tab_id)
+    if cached is not None and cached[0] > 0 and cached[1] > 0:
        return cached
    bridge = get_bridge()
    try:
-        result = await bridge.evaluate(tab_id, "({w: window.innerWidth})")
-        inner = float(((result or {}).get("result") or {}).get("w") or 0)
+        result = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
+        inner = (result or {}).get("result") or {}
+        cw = int(float(inner.get("w") or 0))
+        ch = int(float(inner.get("h") or 0))
    except Exception:
-        inner = 0.0
-    if inner <= 0:
-        # Degraded: no viewport width available. Treat image px as CSS px.
-        scale = 1.0
-    else:
-        scale = inner / _SCREENSHOT_WIDTH
-    _screenshot_css_scales[tab_id] = scale
-    return scale
+        cw, ch = 0, 0
+    if cw <= 0 or ch <= 0:
+        # Degraded: bridge didn't return viewport. Cache an identity
+        # so we don't retry on every call; corrects itself after the
+        # next successful browser_screenshot.
+        cw, ch = 1, 1
+    _viewport_sizes[tab_id] = (cw, ch)
+    return cw, ch


 def register_inspection_tools(mcp: FastMCP) -> None:
@@ -219,22 +227,28 @@ def register_inspection_tools(mcp: FastMCP) -> None:
        """
        Take a screenshot of the current page.

-        Image is 800 px wide (JPEG quality 75, ~50–120 KB). A pixel you
-        see in this image is the same number you pass to
-        ``browser_click_coordinate`` / ``browser_hover_coordinate`` /
-        ``browser_press_at`` — the tools translate to CSS internally.
+        Image is 800 px wide (JPEG quality 75, ~50–120 KB). All
+        coordinate tools work in **fractions of the viewport (0..1)**,
+        not pixels — so read a target's proportional position off this
+        image ("~35 % from the left, ~20 % from the top") and pass
+        ``(0.35, 0.20)`` to ``browser_click_coordinate`` /
+        ``browser_hover_coordinate`` / ``browser_press_at``.
        ``browser_get_rect`` and ``browser_shadow_query`` likewise
-        return coordinates in screenshot pixels.
+        return coordinates as fractions.

        Args:
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
-            full_page: Capture full scrollable page (default: False)
+            full_page: Capture full scrollable page (default: False).
+                Note: full_page images extend beyond the viewport, so
+                fractions read off them do NOT map cleanly to
+                viewport-space clicks. Use for reading / overview only,
+                not for pointing.
            selector: CSS selector to screenshot a specific element (optional)
            annotate: Draw bounding box of last interaction on image (default: True)

        Returns:
-            List of content blocks: text metadata + image
+            List of content blocks: text metadata + image.
        """
        start = time.perf_counter()
        params = {
@@ -299,18 +313,20 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            # Image.open/resize/ImageDraw/composite on a 2-megapixel
            # PNG blocks for ~150–300 ms of CPU — plenty to freeze the
            # asyncio event loop. Reentrant: no shared state.
-            data, physical_scale, css_scale = await asyncio.to_thread(
+            data, physical_scale = await asyncio.to_thread(
                _resize_and_annotate,
                data,
                css_width,
                dpr,
                highlights,
            )
-            # Refresh caches so click / hover / press / rect tools can
-            # translate image px ↔ CSS px without asking the page again.
-            if target_tab is not None:
+            # Cache live viewport dimensions so click / hover / press /
+            # rect tools can translate fractions ↔ CSS px without
+            # asking the page again.
+            css_height = int(screenshot_result.get("cssHeight", 0)) or 0
+            if target_tab is not None and css_width > 0 and css_height > 0:
+                _viewport_sizes[target_tab] = (int(css_width), css_height)
                _screenshot_scales[target_tab] = physical_scale
-                _screenshot_css_scales[target_tab] = css_scale

            meta = json.dumps(
                {
@@ -321,18 +337,21 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                    "size": len(base64.b64decode(data)) if data else 0,
                    "imageWidth": _SCREENSHOT_WIDTH,
                    "cssWidth": css_width,
+                    "cssHeight": css_height,
                    "fullPage": full_page,
                    "devicePixelRatio": dpr,
                    "physicalScale": physical_scale,
-                    "cssScale": css_scale,
                    "annotated": bool(highlights),
                    "scaleHint": (
-                        "Image is 800 px wide. Pass pixel coordinates "
-                        "you read off this image straight into "
+                        "Coordinates for click / hover / press are "
+                        "fractions 0..1 of the viewport. Read a "
+                        "target's proportional position off this image "
+                        "(e.g. '~35 % from the left, ~20 % from the top' "
+                        "→ (0.35, 0.20)) and pass that to "
                        "browser_click_coordinate / "
-                        "browser_hover_coordinate / browser_press_at — "
-                        "the tools translate image px → CSS px "
-                        "internally (cssScale is for debug only)."
+                        "browser_hover_coordinate / browser_press_at. "
+                        "browser_get_rect / browser_shadow_query / "
+                        "focused_element.rect return fractions too."
                    ),
                }
            )
@@ -345,7 +364,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                    "size": len(base64.b64decode(data)) if data else 0,
                    "url": screenshot_result.get("url", ""),
                    "cssWidth": css_width,
-                    "cssScale": css_scale,
+                    "cssHeight": css_height,
                    "physicalScale": physical_scale,
                    "dpr": dpr,
                },
@@ -376,9 +395,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:

        Traverses shadow roots to find elements inside closed/open shadow DOM,
        overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
-        Returns the element's bounding rect in screenshot pixels — feed
-        ``rect.cx`` / ``rect.cy`` straight into browser_click_coordinate
-        / hover_coordinate / press_at.
+        Returns the element's bounding rect as **fractions of the
+        viewport (0..1)** — feed ``rect.cx`` / ``rect.cy`` straight
+        into browser_click_coordinate / hover_coordinate / press_at.

        Args:
            selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
@@ -387,7 +406,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
+            Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
+            plus ``cssWidth`` / ``cssHeight`` for reference.
        """
        bridge = get_bridge()
        if not bridge or not bridge.is_connected:
@@ -404,23 +424,26 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        css_scale = await _ensure_css_scale(target_tab)
-        s = css_scale if css_scale > 0 else 1.0
+        cw, ch = await _ensure_viewport_size(target_tab)
+        cw_f = float(cw) if cw > 0 else 1.0
+        ch_f = float(ch) if ch > 0 else 1.0
        return {
            "ok": True,
            "selector": selector,
            "tag": rect.get("tag"),
            "rect": {
-                "x": round(rect["x"] / s, 1),
-                "y": round(rect["y"] / s, 1),
-                "w": round(rect["w"] / s, 1),
-                "h": round(rect["h"] / s, 1),
-                "cx": round(rect["cx"] / s, 1),
-                "cy": round(rect["cy"] / s, 1),
+                "x": round(rect["x"] / cw_f, 4),
+                "y": round(rect["y"] / ch_f, 4),
+                "w": round(rect["w"] / cw_f, 4),
+                "h": round(rect["h"] / ch_f, 4),
+                "cx": round(rect["cx"] / cw_f, 4),
+                "cy": round(rect["cy"] / ch_f, 4),
            },
+            "cssWidth": cw,
+            "cssHeight": ch,
            "note": (
-                "rect fields are in screenshot pixels. Pass rect.cx / "
-                "rect.cy to browser_click_coordinate / "
+                "rect fields are fractions of the viewport (0..1). "
+                "Pass rect.cx / rect.cy to browser_click_coordinate / "
                "hover_coordinate / press_at."
            ),
        }
@@ -435,9 +458,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
        Get the bounding rect of an element by CSS selector.

        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
-        content. Returns the rect in screenshot pixels — the same
-        numbers you'd read off a browser_screenshot, and the same
-        numbers browser_click_coordinate expects.
+        content. Returns the rect as **fractions of the viewport
+        (0..1)** — the same coordinate space browser_click_coordinate
+        / hover_coordinate / press_at expect.

        Args:
            selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
@@ -446,7 +469,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
+            Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
+            plus ``cssWidth`` / ``cssHeight`` for reference.
        """
        bridge = get_bridge()
        if not bridge or not bridge.is_connected:
@@ -463,23 +487,26 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        css_scale = await _ensure_css_scale(target_tab)
-        s = css_scale if css_scale > 0 else 1.0
+        cw, ch = await _ensure_viewport_size(target_tab)
+        cw_f = float(cw) if cw > 0 else 1.0
+        ch_f = float(ch) if ch > 0 else 1.0
        return {
            "ok": True,
            "selector": selector,
            "tag": rect.get("tag"),
            "rect": {
-                "x": round(rect["x"] / s, 1),
-                "y": round(rect["y"] / s, 1),
-                "w": round(rect["w"] / s, 1),
-                "h": round(rect["h"] / s, 1),
-                "cx": round(rect["cx"] / s, 1),
-                "cy": round(rect["cy"] / s, 1),
+                "x": round(rect["x"] / cw_f, 4),
+                "y": round(rect["y"] / ch_f, 4),
+                "w": round(rect["w"] / cw_f, 4),
+                "h": round(rect["h"] / ch_f, 4),
+                "cx": round(rect["cx"] / cw_f, 4),
+                "cy": round(rect["cy"] / ch_f, 4),
            },
+            "cssWidth": cw,
+            "cssHeight": ch,
            "note": (
-                "rect fields are in screenshot pixels. Pass rect.cx / "
-                "rect.cy to browser_click_coordinate / "
+                "rect fields are fractions of the viewport (0..1). "
+                "Pass rect.cx / rect.cy to browser_click_coordinate / "
                "hover_coordinate / press_at."
            ),
        }
@@ -108,25 +108,31 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        button: Literal["left", "right", "middle"] = "left",
    ) -> dict:
        """
-        Click at the given SCREENSHOT pixel.
+        Click at a FRACTION of the viewport (0..1, 0..1).

-        ``x`` and ``y`` are pixel coordinates read directly off a
-        ``browser_screenshot`` image (800 px wide JPEG). The tool
-        multiplies them by the cached image→CSS scale for the tab
-        before dispatching to Chrome — no scale awareness required on
-        the caller side. ``browser_get_rect`` / ``browser_shadow_query``
-        return coordinates in the same (screenshot) space.
+        Coordinates are **fractions of the viewport**, not pixels:
+        ``(0.5, 0.5)`` is the center, ``(0.1, 0.2)`` is 10 % from the
+        left and 20 % from the top. Read a target's proportional
+        position off ``browser_screenshot`` (or pass
+        ``rect.cx`` / ``rect.cy`` from ``browser_get_rect`` /
+        ``browser_shadow_query`` directly — they return fractions too).
+
+        Fractions are used because every vision model resizes or tiles
+        images differently (Claude ~1.15 MP target, GPT-4o 512-px
+        tiles, etc.). Proportional positions survive every such
+        transform; pixel coords do not.

        Args:
-            x: X coordinate in screenshot pixels.
-            y: Y coordinate in screenshot pixels.
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
            button: Mouse button to click (left, right, middle)

        Returns:
            Dict with click result, including ``focused_element``
-            describing what the click focused.
+            describing what the click focused. ``focused_element.rect``
+            is also in fractions.
        """
        start = time.perf_counter()
        params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -149,18 +155,33 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_click_coordinate", params, result=result)
            return result

-        try:
-            from .inspection import _ensure_css_scale
+        # Pixel-input guard: legitimate fractions live in [0, 1]. Allow a
+        # small overshoot tolerance for edge targets.
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport. Read the target's "
+                    "proportional position off browser_screenshot, or pass "
+                    "rect.cx / rect.cy from browser_get_rect / "
+                    "browser_shadow_query (they return fractions)."
+                ),
+            }
+            log_tool_call("browser_click_coordinate", params, result=result)
+            return result

-            css_scale = await _ensure_css_scale(target_tab)
-            s = css_scale if css_scale > 0 else 1.0
-            css_x = x * s
-            css_y = y * s
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            css_x = x * cw
+            css_y = y * ch
            click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
            log_tool_call(
                "browser_click_coordinate",
                params,
-                result={**click_result, "cssScale": round(css_scale, 4)},
+                result={**click_result, "cssWidth": cw, "cssHeight": ch},
                duration_ms=(time.perf_counter() - start) * 1000,
            )
            return click_result
@@ -485,17 +506,16 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Hover at the given SCREENSHOT pixel.
+        Hover at a FRACTION of the viewport (0..1, 0..1).

        Use this instead of browser_hover when the element is in an overlay,
        shadow DOM, or virtual-rendered component that isn't in the regular DOM.
-        ``x`` / ``y`` are pixel coordinates read directly off a
-        ``browser_screenshot`` image; the tool translates to CSS px
-        internally before dispatching to Chrome.
+        ``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
+        the tool converts to CSS px internally.

        Args:
-            x: X coordinate in screenshot pixels.
-            y: Y coordinate in screenshot pixels.
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")

@@ -523,12 +543,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_hover_coordinate", params, result=result)
            return result

-        try:
-            from .inspection import _ensure_css_scale
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport."
+                ),
+            }
+            log_tool_call("browser_hover_coordinate", params, result=result)
+            return result

-            css_scale = await _ensure_css_scale(target_tab)
-            s = css_scale if css_scale > 0 else 1.0
-            hover_result = await bridge.hover_coordinate(target_tab, x * s, y * s)
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
            log_tool_call(
                "browser_hover_coordinate",
                params,
@@ -555,18 +585,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Move mouse to the given SCREENSHOT pixel, then press a key.
+        Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.

        Use this instead of browser_press when the focused element is in an overlay
        or virtual-rendered component. Moving the mouse first routes the key event
        through native browser hit-testing instead of the DOM focus chain.
-        ``x`` / ``y`` are pixel coordinates read directly off a
-        ``browser_screenshot`` image; the tool translates to CSS px
-        internally.
+        ``x`` / ``y`` are fractions of the viewport; the tool converts
+        to CSS px internally.

        Args:
-            x: X coordinate in screenshot pixels.
-            y: Y coordinate in screenshot pixels.
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
            key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
@@ -595,12 +624,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_press_at", params, result=result)
            return result

-        try:
-            from .inspection import _ensure_css_scale
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport."
+                ),
+            }
+            log_tool_call("browser_press_at", params, result=result)
+            return result

-            css_scale = await _ensure_css_scale(target_tab)
-            s = css_scale if css_scale > 0 else 1.0
-            press_result = await bridge.press_key_at(target_tab, x * s, y * s, key)
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
            log_tool_call(
                "browser_press_at",
                params,