fix: model invariant screenshot

2026-04-16 20:29:05 -07:00
parent c6b6a5a2f7
commit aba0ff07ba
11 changed files with 320 additions and 323 deletions
@@ -80,33 +80,57 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None:
 _interaction_highlights: dict[int, dict] = {}


-# Compact descriptor of document.activeElement. Returned by both click()
+# Compact descriptor of the focused element. Returned by both click()
 # and click_coordinate() so the agent can verify it focused what it
-# intended, then decide whether to follow up with browser_type_focused(text=...).
-# Keeping this as a single shared string avoids drift
-# between the two click paths.
+# intended. When the outer document's activeElement is an <iframe>,
+# we recurse into the iframe's document (same-origin only) so the
+# response describes the real inner element — otherwise the agent
+# always sees {tag: "iframe"} and can't tell whether it hit the
+# composer or something else inside the frame (e.g. a sidebar item
+# in LinkedIn's #interop-outlet messaging overlay).
 _FOCUSED_ELEMENT_JS = """
 (function() {
+    function describe(el) {
+        var rect = el.getBoundingClientRect();
+        var attrs = {};
+        for (var i = 0; i < el.attributes.length && i < 10; i++) {
+            attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+        }
+        return {
+            tag: el.tagName.toLowerCase(),
+            id: el.id || null,
+            className: el.className || null,
+            name: el.getAttribute('name') || null,
+            type: el.getAttribute('type') || null,
+            role: el.getAttribute('role') || null,
+            contenteditable: el.getAttribute('contenteditable') || null,
+            text: (el.innerText || '').substring(0, 200),
+            value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
+            attributes: attrs,
+            rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
+        };
+    }
    var el = document.activeElement;
    if (!el || el === document.body) return null;
-    var rect = el.getBoundingClientRect();
-    var attrs = {};
-    for (var i = 0; i < el.attributes.length && i < 10; i++) {
-        attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+    // Descend into same-origin iframes. Capped at 5 levels of
+    // nesting to bound cost. Cross-origin frames throw on
+    // contentDocument access → we catch and report the outermost
+    // iframe instead.
+    var framePath = [];
+    var depth = 0;
+    while (el && (el.tagName === 'IFRAME' || el.tagName === 'FRAME') && depth < 5) {
+        framePath.push(el.id || el.getAttribute('data-testid') || el.tagName.toLowerCase());
+        var innerDoc = null;
+        try { innerDoc = el.contentDocument; } catch (e) { innerDoc = null; }
+        if (!innerDoc) break;
+        var innerActive = innerDoc.activeElement;
+        if (!innerActive || innerActive === innerDoc.body) break;
+        el = innerActive;
+        depth++;
    }
-    return {
-        tag: el.tagName.toLowerCase(),
-        id: el.id || null,
-        className: el.className || null,
-        name: el.getAttribute('name') || null,
-        type: el.getAttribute('type') || null,
-        role: el.getAttribute('role') || null,
-        contenteditable: el.getAttribute('contenteditable') || null,
-        text: (el.innerText || '').substring(0, 200),
-        value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
-        attributes: attrs,
-        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
-    };
+    var out = describe(el);
+    if (framePath.length) out.inFrame = framePath;
+    return out;
 })()
 """

@@ -937,16 +961,33 @@ class BeelineBridge:
    async def _read_focused_element(self, tab_id: int) -> dict | None:
        """Read document.activeElement and return a compact descriptor.

-        Returns None on any failure — never raises. Used by both click
-        paths (selector-based click() and click_coordinate()) so the
-        agent gets the same response shape regardless of which one was
-        called. The descriptor lets the agent answer "did my click land
-        on an editable?" without a second round-trip.
+        The JS returns ``rect`` in CSS px (``getBoundingClientRect``);
+        we scale it to screenshot px, the coord space the agent passes
+        to click / hover / press_at. NOTE: for an element inside a
+        same-origin iframe the rect is relative to that frame's viewport.
+
+        Returns None on any failure — never raises.
        """
        try:
            await self._try_enable_domain(tab_id, "Runtime")
            result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
-            return (result or {}).get("result")
+            info = (result or {}).get("result")
+            if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
+                # Convert CSS px rect → screenshot px using the cached
+                # scale. Fall back to 1.0 if no screenshot has been
+                # taken yet on this tab.
+                from .tools.inspection import _screenshot_css_scales
+
+                scale = _screenshot_css_scales.get(tab_id, 1.0) or 1.0
+                if scale > 0 and scale != 1.0:
+                    r = info["rect"]
+                    info["rect"] = {
+                        "x": round(r.get("x", 0) / scale, 1),
+                        "y": round(r.get("y", 0) / scale, 1),
+                        "width": round(r.get("width", 0) / scale, 1),
+                        "height": round(r.get("height", 0) / scale, 1),
+                    }
+            return info
        except Exception:
            return None

@@ -959,18 +1000,11 @@ class BeelineBridge:
        button_map = {"left": "left", "right": "right", "middle": "middle"}
        cdp_button = button_map.get(button, "left")

-        from .tools.inspection import _screenshot_css_scales, _screenshot_scales
-
-        phys_scale = _screenshot_scales.get(tab_id, "unset")
-        css_scale = _screenshot_css_scales.get(tab_id, "unset")
        logger.info(
-            "click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent. "
-            "stored_scales: physicalScale=%s, cssScale=%s",
+            "click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent",
            tab_id,
            x,
            y,
-            phys_scale,
-            css_scale,
        )

        await self._cdp(
@@ -255,6 +255,16 @@ def register_advanced_tools(mcp: FastMCP) -> None:

        try:
            result = await bridge.resize(target_tab, width, height)
+            # Invalidate per-tab scale caches — CSS width changed, so the
+            # cached image→CSS multiplier is stale. Click / rect tools
+            # will re-query innerWidth on next use via _ensure_css_scale.
+            try:
+                from .inspection import _screenshot_css_scales, _screenshot_scales
+
+                _screenshot_css_scales.pop(target_tab, None)
+                _screenshot_scales.pop(target_tab, None)
+            except Exception:
+                pass
            return result
        except Exception as e:
            return {"ok": False, "error": str(e)}
@@ -23,12 +23,21 @@ from .tabs import _get_context

 logger = logging.getLogger(__name__)

-# Target width for normalized screenshots (px in the delivered image)
-_SCREENSHOT_WIDTH = 600

-# Maps tab_id -> physical scale: image_coord × scale = physical pixels (for CDP Input events)
+# Fixed output width for all screenshots. Chosen well below Anthropic's
+# ~1568-px vision-API resize threshold so the image the server emits is
+# the SAME image (pixel-for-pixel) the LLM sees. That preserves
+# image_px == model_px, which is the cornerstone of the "LLM works in
+# screenshot pixels only" contract — all click/hover/press/rect tools
+# translate between image pixels and CSS pixels internally.
+_SCREENSHOT_WIDTH = 800
+
+# Per-tab scale caches populated on every browser_screenshot and on
+# lazy-init inside the click tools. Both are ``image_px × scale =
+# target_px`` multipliers.
+# - _screenshot_scales[tab]      → physical scale (image → physical px, debug only)
+# - _screenshot_css_scales[tab]  → css scale      (image → CSS px, used for Input events)
 _screenshot_scales: dict[int, float] = {}
-# Maps tab_id -> CSS scale: image_coord × scale = CSS pixels (for DOM APIs / getBoundingClientRect)
 _screenshot_css_scales: dict[int, float] = {}


@@ -37,18 +46,28 @@ def _resize_and_annotate(
    css_width: int,
    dpr: float = 1.0,
    highlights: list[dict] | None = None,
-    width: int = _SCREENSHOT_WIDTH,
 ) -> tuple[str, float, float]:
-    """Resize a base64 PNG to _SCREENSHOT_WIDTH wide, annotate highlights.
+    """Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
+    and re-encode as JPEG quality 75.

-    Returns (new_b64, physical_scale, css_scale) where:
-      physical_scale = physical_px_per_image_px  (multiply image coords → physical px)
-      css_scale      = css_px_per_image_px        (multiply image coords → CSS px for DOM APIs)
+    CDP captures at the physical-pixel resolution (DPR × CSS). We
+    downscale to 800 px wide so the delivered image stays under
+    Anthropic's vision-API resize cap — the model sees pixel-for-pixel
+    what we send.

-    Highlights have x,y,w,h in CSS pixels (what getBoundingClientRect returns,
-    and what CDP Input.dispatchMouseEvent accepts).
-    Falls back to original data if Pillow unavailable or resize fails.
+    Returns ``(new_b64, physical_scale, css_scale)`` where
+    - ``physical_scale = orig_png_w / _SCREENSHOT_WIDTH`` (image → physical px)
+    - ``css_scale      = css_width / _SCREENSHOT_WIDTH`` (image → CSS px)
+
+    Highlight rects arrive in CSS px and are divided by ``css_scale``
+    before drawing so overlays land in the correct spot on the
+    800-wide output.
    """
+    if not css_width or css_width <= 0:
+        # Bridge always supplies css_width from window.innerWidth; only
+        # reach here on a degraded response. Return the raw PNG.
+        return data, 1.0, 1.0
+
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
@@ -58,21 +77,16 @@ def _resize_and_annotate(
            import struct

            orig_w = struct.unpack(">I", raw[16:20])[0]
-        raw_size_bytes = len(raw)
-        physical_scale = orig_w / width if orig_w and width else 1.0
-        css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
+        physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
+        css_scale = css_width / _SCREENSHOT_WIDTH
        logger.warning(
-            "PIL not available — screenshot resize SKIPPED (cannot downscale image). "
-            "raw_size=%d bytes, png_width=%d, css_width=%s, dpr=%s, target_width=%d. "
-            "Returning ORIGINAL image with computed scales: physicalScale=%.4f, cssScale=%.4f. "
-            "Agent must use browser_coords() to convert image positions before clicking.",
-            raw_size_bytes,
-            orig_w,
-            css_width,
-            dpr,
-            width,
+            "PIL not available — screenshot resize SKIPPED. "
+            "Returning raw physical-px PNG. physicalScale=%.4f, "
+            "cssScale=%.4f, css_width=%d, dpr=%s. Install Pillow for correct clicks.",
            physical_scale,
            css_scale,
+            css_width,
+            dpr,
        )
        return data, round(physical_scale, 4), round(css_scale, 4)

@@ -81,25 +95,25 @@ def _resize_and_annotate(
        img = Image.open(io.BytesIO(raw)).convert("RGBA")
        orig_w, orig_h = img.size

-        physical_scale = orig_w / width
-        css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
+        physical_scale = orig_w / _SCREENSHOT_WIDTH
+        css_scale = css_width / _SCREENSHOT_WIDTH
+        new_w = _SCREENSHOT_WIDTH
+        new_h = round(orig_h * new_w / orig_w)
+        if (new_w, new_h) != img.size:
+            img = img.resize((new_w, new_h), Image.LANCZOS)

        logger.info(
-            "Screenshot resize: orig=%dx%d → target=%dx%d, css_width=%s, dpr=%s, physicalScale=%.4f, cssScale=%.4f",
+            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, cssScale=%.4f",
            orig_w,
            orig_h,
-            width,
-            round(orig_h * width / orig_w),
+            new_w,
+            new_h,
            css_width,
            dpr,
            physical_scale,
            css_scale,
        )

-        new_w = width
-        new_h = round(orig_h * new_w / orig_w)
-        img = img.resize((new_w, new_h), Image.LANCZOS)
-
        if highlights:
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(overlay)
@@ -111,7 +125,7 @@ def _resize_and_annotate(
            for h in highlights:
                kind = h.get("kind", "rect")
                label = h.get("label", "")
-                # Highlights are in CSS px → convert to image px
+                # Highlights arrive in CSS px → convert to image px.
                ix = h["x"] / css_scale
                iy = h["y"] / css_scale
                iw = h.get("w", 0) / css_scale
@@ -135,11 +149,9 @@ def _resize_and_annotate(
                        width=2,
                    )

-                # Label: show image pixel position so user knows where to look
-                img_coords = f"img:({round(ix)},{round(iy)})"
-                display_label = f"{img_coords} {label}" if label else img_coords
+                display_label = f"({round(ix)},{round(iy)}) {label}".strip()
                lx, ly = ix, max(2, iy - 16)
-                lx = max(2, min(lx, width - 120))
+                lx = max(2, min(lx, new_w - 120))
                bbox = draw.textbbox((lx, ly), display_label, font=font)
                pad = 3
                draw.rectangle(
@@ -153,7 +165,7 @@ def _resize_and_annotate(
            img = img.convert("RGB")

        buf = io.BytesIO()
-        img.save(buf, format="PNG", optimize=True)
+        img.convert("RGB").save(buf, format="JPEG", quality=75, optimize=True)  # JPEG has no alpha channel
        return (
            base64.b64encode(buf.getvalue()).decode(),
            round(physical_scale, 4),
@@ -161,16 +173,38 @@ def _resize_and_annotate(
        )
    except Exception:
        logger.warning(
-            "Screenshot resize/annotate FAILED — returning original image with scale=1.0. "
-            "css_width=%s, dpr=%s, target_width=%d. Clicks will be misaligned.",
+            "Screenshot resize/annotate FAILED — returning original image. "
+            "css_width=%s, dpr=%s.",
            css_width,
            dpr,
-            width,
            exc_info=True,
        )
        return data, 1.0, 1.0


+async def _ensure_css_scale(tab_id: int) -> float:
+    """Return the image→CSS scale for ``tab_id``, querying
+    ``window.innerWidth`` and caching it if missing (e.g. a click before
+    the first screenshot). If the width can't be read, return 1.0
+    WITHOUT caching so the next call retries.
+    """
+    cached = _screenshot_css_scales.get(tab_id)
+    if cached is not None and cached > 0:
+        return cached
+    bridge = get_bridge()
+    try:
+        result = await bridge.evaluate(tab_id, "({w: window.innerWidth})")
+        inner = float(((result or {}).get("result") or {}).get("w") or 0)
+    except Exception:
+        inner = 0.0
+    if inner <= 0:
+        # Degraded: no viewport width available. Treat image px as CSS px,
+        # but don't cache the guess — the next call should re-query.
+        return 1.0
+    scale = inner / _SCREENSHOT_WIDTH
+    _screenshot_css_scales[tab_id] = scale
+    return scale
+
+
 def register_inspection_tools(mcp: FastMCP) -> None:
    """Register browser inspection tools."""

@@ -180,26 +214,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
        full_page: bool = False,
        selector: str | None = None,
-        image_type: Literal["png", "jpeg"] = "png",
        annotate: bool = True,
-        width: int = _SCREENSHOT_WIDTH,
    ) -> list:
        """
        Take a screenshot of the current page.

-        Returns a normalized image alongside text metadata (URL, size, scale
-        factors, etc.). Automatically annotates the last interaction (click,
-        hover, type) with a bounding box overlay.
+        Image is 800 px wide (JPEG quality 75, ~50–120 KB). A pixel you
+        see in this image is the same number you pass to
+        ``browser_click_coordinate`` / ``browser_hover_coordinate`` /
+        ``browser_press_at`` — the tools translate to CSS internally.
+        ``browser_get_rect`` and ``browser_shadow_query`` likewise
+        return coordinates in screenshot pixels.

        Args:
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
            full_page: Capture full scrollable page (default: False)
            selector: CSS selector to screenshot a specific element (optional)
-            image_type: Image format - png or jpeg (default: png)
            annotate: Draw bounding box of last interaction on image (default: True)
-            width: Output image width in pixels (default: 600). Use 800+ for fine
-                   text, 400 for quick layout checks.

        Returns:
            List of content blocks: text metadata + image
@@ -252,7 +284,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                return [TextContent(type="text", text=json.dumps(screenshot_result))]

            data = screenshot_result.get("data")
-            mime_type = screenshot_result.get("mimeType", "image/png")
            css_width = screenshot_result.get("cssWidth", 0)
            dpr = screenshot_result.get("devicePixelRatio", 1.0)

@@ -263,45 +294,45 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            if annotate and target_tab in _interaction_highlights:
                highlights = [_interaction_highlights[target_tab]]

-            # Normalize to 800px wide and annotate. Offloaded to a
-            # thread because PIL Image.open/resize/ImageDraw/composite on
-            # a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
-            # freeze the asyncio event loop and delay every concurrent
-            # tool call during a screenshot. The function is reentrant
-            # (fresh PIL Image per call, no shared state), so to_thread
-            # is safe.
+            # Resize to the fixed _SCREENSHOT_WIDTH (image px × cssScale
+            # = CSS px) and re-encode as JPEG. Offloaded to a thread
+            # because PIL Image.open/resize/ImageDraw/composite on a
+            # 2-megapixel PNG blocks for ~150–300 ms of CPU — plenty to
+            # freeze the asyncio event loop. Reentrant: no shared state.
            data, physical_scale, css_scale = await asyncio.to_thread(
                _resize_and_annotate,
                data,
                css_width,
                dpr,
                highlights,
-                width,
            )
-            _screenshot_scales[target_tab] = physical_scale
-            _screenshot_css_scales[target_tab] = css_scale
+            # Refresh caches so click / hover / press / rect tools can
+            # translate image px ↔ CSS px without asking the page again.
+            if target_tab is not None:
+                _screenshot_scales[target_tab] = physical_scale
+                _screenshot_css_scales[target_tab] = css_scale

            meta = json.dumps(
                {
                    "ok": True,
                    "tabId": target_tab,
                    "url": screenshot_result.get("url", ""),
-                    "imageType": mime_type.split("/")[-1],
+                    "imageType": "jpeg",
                    "size": len(base64.b64decode(data)) if data else 0,
-                    "imageWidth": width,
+                    "imageWidth": _SCREENSHOT_WIDTH,
+                    "cssWidth": css_width,
                    "fullPage": full_page,
                    "devicePixelRatio": dpr,
                    "physicalScale": physical_scale,
                    "cssScale": css_scale,
                    "annotated": bool(highlights),
                    "scaleHint": (
-                        f"image_coord × {css_scale} = CSS px "
-                        f"→ feed to browser_click_coordinate, "
-                        f"browser_hover_coordinate, browser_press_at "
-                        f"(CDP Input events use CSS pixels). "
-                        f"image_coord × {physical_scale} = physical px "
-                        f"is debug-only on HiDPI displays and must NOT "
-                        f"be used for clicks — it overshoots by DPR×."
+                        "Image is 800 px wide. Pass pixel coordinates "
+                        "you read off this image straight into "
+                        "browser_click_coordinate / "
+                        "browser_hover_coordinate / browser_press_at — "
+                        "the tools translate image px → CSS px "
+                        "internally (cssScale is for debug only)."
                    ),
                }
            )
@@ -313,17 +344,17 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                    "ok": True,
                    "size": len(base64.b64decode(data)) if data else 0,
                    "url": screenshot_result.get("url", ""),
-                    "physicalScale": physical_scale,
+                    "cssWidth": css_width,
                    "cssScale": css_scale,
-                    "debug_cssWidth": css_width,
-                    "debug_dpr": dpr,
+                    "physicalScale": physical_scale,
+                    "dpr": dpr,
                },
                duration_ms=(time.perf_counter() - start) * 1000,
            )

            return [
                TextContent(type="text", text=meta),
-                ImageContent(type="image", data=data, mimeType=mime_type),
+                # Fallback paths return the raw PNG capture ("iVBOR" = base64 PNG signature).
+                ImageContent(type="image", data=data, mimeType=f"image/{'png' if data[:5] == 'iVBOR' else 'jpeg'}"),
            ]
        except Exception as e:
            log_tool_call(
@@ -334,73 +365,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            )
            return [TextContent(type="text", text=json.dumps({"ok": False, "error": str(e)}))]

-    @mcp.tool()
-    def browser_coords(
-        x: float,
-        y: float,
-        tab_id: int | None = None,
-        profile: str | None = None,
-    ) -> dict:
-        """
-        Convert screenshot image coordinates to browser click coordinates.
-
-        After browser_screenshot returns a downscaled image, use this to
-        translate pixel positions you see in the image into the CSS pixel
-        coordinates that Chrome DevTools Protocol expects.
-
-        **CDP Input.dispatchMouseEvent uses CSS pixels**, so you want
-        ``css_x`` / ``css_y`` for every click/hover tool. ``physical_x/y``
-        is kept in the return for debugging on HiDPI displays — do NOT
-        feed it to clicks; on a DPR=2 screen it lands 2× too far.
-
-        Edge case: pages using ``zoom`` or ``transform: scale()`` (e.g.
-        LinkedIn's ``#interop-outlet`` shadow DOM) render in a scaled
-        local coordinate space. For those, ``getBoundingClientRect()``
-        reports pre-zoom coordinates and you may still need to multiply
-        by the element's effective zoom. Use browser_shadow_query to
-        get the zoomed rect directly.
-
-        Args:
-            x: X pixel position in the screenshot image
-            y: Y pixel position in the screenshot image
-            tab_id: Chrome tab ID (default: active tab for profile)
-            profile: Browser profile name (default: "default")
-
-        Returns:
-            Dict with css_x, css_y (primary — use these), physical_x,
-            physical_y (debug only), and scale factors.
-        """
-        ctx = _get_context(profile)
-        target_tab = tab_id or (ctx.get("activeTabId") if ctx else None)
-
-        physical_scale = _screenshot_scales.get(target_tab, 1.0) if target_tab else 1.0
-        # css_scale stored in second slot via _screenshot_css_scales
-        css_scale = _screenshot_css_scales.get(target_tab, physical_scale) if target_tab else physical_scale
-
-        return {
-            "ok": True,
-            # Primary output: CSS pixels. Feed these to click/hover/press.
-            "css_x": round(x * css_scale, 1),
-            "css_y": round(y * css_scale, 1),
-            # Debug output: raw physical pixels. DO NOT feed to clicks on
-            # HiDPI displays — CDP Input events use CSS pixels, so sending
-            # physical coordinates lands the click at roughly DPR× the
-            # intended position.
-            "physical_x": round(x * physical_scale, 1),
-            "physical_y": round(y * physical_scale, 1),
-            "physicalScale": physical_scale,
-            "cssScale": css_scale,
-            "tabId": target_tab,
-            "note": (
-                "Use css_x/css_y with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "Chrome DevTools Protocol Input.dispatchMouseEvent "
-                "operates in CSS pixels. physical_x/y is for debugging "
-                "on HiDPI displays only; feeding it to clicks lands "
-                "them at DPR× the intended coordinate."
-            ),
-        }
-
    @mcp.tool()
    async def browser_shadow_query(
        selector: str,
@@ -412,7 +376,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:

        Traverses shadow roots to find elements inside closed/open shadow DOM,
        overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
-        Returns getBoundingClientRect in both CSS and physical pixels.
+        Returns the element's bounding rect in screenshot pixels — feed
+        ``rect.cx`` / ``rect.cy`` straight into browser_click_coordinate
+        / hover_coordinate / press_at.

        Args:
            selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
@@ -421,7 +387,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with rect (CSS px) and physical rect (CSS px × DPR) of the element
+            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
        """
        bridge = get_bridge()
        if not bridge or not bridge.is_connected:
@@ -438,36 +404,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        physical_scale = _screenshot_scales.get(target_tab, 1.0)
-        css_scale = _screenshot_css_scales.get(target_tab, 1.0)
-        dpr = physical_scale / css_scale if css_scale else 1.0
-
+        css_scale = await _ensure_css_scale(target_tab)
+        s = css_scale if css_scale > 0 else 1.0
        return {
            "ok": True,
            "selector": selector,
            "tag": rect.get("tag"),
-            "css": {
-                "x": rect["x"],
-                "y": rect["y"],
-                "w": rect["w"],
-                "h": rect["h"],
-                "cx": rect["cx"],
-                "cy": rect["cy"],
-            },
-            "physical": {
-                "x": round(rect["x"] * dpr, 1),
-                "y": round(rect["y"] * dpr, 1),
-                "w": round(rect["w"] * dpr, 1),
-                "h": round(rect["h"] * dpr, 1),
-                "cx": round(rect["cx"] * dpr, 1),
-                "cy": round(rect["cy"] * dpr, 1),
+            "rect": {
+                "x": round(rect["x"] / s, 1),
+                "y": round(rect["y"] / s, 1),
+                "w": round(rect["w"] / s, 1),
+                "h": round(rect["h"] / s, 1),
+                "cx": round(rect["cx"] / s, 1),
+                "cy": round(rect["cy"] / s, 1),
            },
            "note": (
-                "Use css.cx/cy with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "CDP Input events operate in CSS pixels. "
-                "physical.* is debug-only; feeding it to clicks "
-                "lands them DPR× too far on HiDPI displays."
+                "rect fields are in screenshot pixels. Pass rect.cx / "
+                "rect.cy to browser_click_coordinate / "
+                "hover_coordinate / press_at."
            ),
        }

@@ -480,11 +434,10 @@ def register_inspection_tools(mcp: FastMCP) -> None:
        """
        Get the bounding rect of an element by CSS selector.

-        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM content.
-        Returns coordinates in CSS pixels (for clicks and DOM APIs); the
-        physical-pixel variant is returned for debugging on HiDPI displays
-        only — it must not be fed to click/hover/press tools, which use
-        CSS pixels.
+        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
+        content. Returns the rect in screenshot pixels — the same
+        numbers you'd read off a browser_screenshot, and the same
+        numbers browser_click_coordinate expects.

        Args:
            selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
@@ -493,7 +446,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with css and physical bounding rects
+            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
        """
        bridge = get_bridge()
        if not bridge or not bridge.is_connected:
@@ -510,36 +463,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        physical_scale = _screenshot_scales.get(target_tab, 1.0)
-        css_scale = _screenshot_css_scales.get(target_tab, 1.0)
-        dpr = physical_scale / css_scale if css_scale else 1.0
-
+        css_scale = await _ensure_css_scale(target_tab)
+        s = css_scale if css_scale > 0 else 1.0
        return {
            "ok": True,
            "selector": selector,
            "tag": rect.get("tag"),
-            "css": {
-                "x": rect["x"],
-                "y": rect["y"],
-                "w": rect["w"],
-                "h": rect["h"],
-                "cx": rect["cx"],
-                "cy": rect["cy"],
-            },
-            "physical": {
-                "x": round(rect["x"] * dpr, 1),
-                "y": round(rect["y"] * dpr, 1),
-                "w": round(rect["w"] * dpr, 1),
-                "h": round(rect["h"] * dpr, 1),
-                "cx": round(rect["cx"] * dpr, 1),
-                "cy": round(rect["cy"] * dpr, 1),
+            "rect": {
+                "x": round(rect["x"] / s, 1),
+                "y": round(rect["y"] / s, 1),
+                "w": round(rect["w"] / s, 1),
+                "h": round(rect["h"] / s, 1),
+                "cx": round(rect["cx"] / s, 1),
+                "cy": round(rect["cy"] / s, 1),
            },
            "note": (
-                "Use css.cx/cy with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "CDP Input events operate in CSS pixels. "
-                "physical.* is debug-only; feeding it to clicks "
-                "lands them DPR× too far on HiDPI displays."
+                "rect fields are in screenshot pixels. Pass rect.cx / "
+                "rect.cy to browser_click_coordinate / "
+                "hover_coordinate / press_at."
            ),
        }

@@ -108,24 +108,25 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        button: Literal["left", "right", "middle"] = "left",
    ) -> dict:
        """
-        Click at specific viewport coordinates (CSS pixels).
+        Click at the given SCREENSHOT pixel.

-        Chrome DevTools Protocol's Input.dispatchMouseEvent operates in
-        **CSS pixels**, not physical pixels. If you have a screenshot
-        image coordinate, convert it with ``browser_coords(x, y)`` and
-        use the returned ``css_x`` / ``css_y`` — not ``physical_x/y``.
-        On a DPR=2 display, feeding physical coordinates lands the click
-        at 2× the intended position.
+        ``x`` and ``y`` are pixel coordinates read directly off a
+        ``browser_screenshot`` image (800 px wide JPEG). The tool
+        multiplies them by the cached image→CSS scale for the tab
+        before dispatching to Chrome — no scale awareness required on
+        the caller side. ``browser_get_rect`` / ``browser_shadow_query``
+        return coordinates in the same (screenshot) space.

        Args:
-            x: X coordinate in CSS pixels (viewport space)
-            y: Y coordinate in CSS pixels (viewport space)
+            x: X coordinate in screenshot pixels.
+            y: Y coordinate in screenshot pixels.
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
            button: Mouse button to click (left, right, middle)

        Returns:
-            Dict with click result
+            Dict with click result, including ``focused_element``
+            describing what the click focused.
        """
        start = time.perf_counter()
        params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -149,17 +150,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            return result

        try:
-            from .inspection import _screenshot_css_scales, _screenshot_scales
+            from .inspection import _ensure_css_scale

-            click_result = await bridge.click_coordinate(target_tab, x, y, button=button)
+            css_scale = await _ensure_css_scale(target_tab)
+            s = css_scale if css_scale > 0 else 1.0
+            css_x = x * s
+            css_y = y * s
+            click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
            log_tool_call(
                "browser_click_coordinate",
                params,
-                result={
-                    **click_result,
-                    "debug_stored_physicalScale": _screenshot_scales.get(target_tab, "unset"),
-                    "debug_stored_cssScale": _screenshot_css_scales.get(target_tab, "unset"),
-                },
+                result={**click_result, "cssScale": round(css_scale, 4)},
                duration_ms=(time.perf_counter() - start) * 1000,
            )
            return click_result
@@ -484,15 +485,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Hover at CSS pixel coordinates without needing a CSS selector.
+        Hover at the given SCREENSHOT pixel.

        Use this instead of browser_hover when the element is in an overlay,
        shadow DOM, or virtual-rendered component that isn't in the regular DOM.
-        Pair with browser_coords to convert screenshot image positions to CSS pixels.
+        ``x`` / ``y`` are pixel coordinates read directly off a
+        ``browser_screenshot`` image; the tool translates to CSS px
+        internally before dispatching to Chrome.

        Args:
-            x: CSS pixel X coordinate
-            y: CSS pixel Y coordinate
+            x: X coordinate in screenshot pixels.
+            y: Y coordinate in screenshot pixels.
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")

@@ -521,7 +524,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            return result

        try:
-            hover_result = await bridge.hover_coordinate(target_tab, x, y)
+            from .inspection import _ensure_css_scale
+
+            css_scale = await _ensure_css_scale(target_tab)
+            s = css_scale if css_scale > 0 else 1.0
+            hover_result = await bridge.hover_coordinate(target_tab, x * s, y * s)
            log_tool_call(
                "browser_hover_coordinate",
                params,
@@ -548,16 +555,18 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Move mouse to CSS pixel coordinates then press a key.
+        Move mouse to the given SCREENSHOT pixel, then press a key.

        Use this instead of browser_press when the focused element is in an overlay
        or virtual-rendered component. Moving the mouse first routes the key event
        through native browser hit-testing instead of the DOM focus chain.
-        Pair with browser_coords to convert screenshot image positions to CSS pixels.
+        ``x`` / ``y`` are pixel coordinates read directly off a
+        ``browser_screenshot`` image; the tool translates to CSS px
+        internally.

        Args:
-            x: CSS pixel X coordinate to position mouse
-            y: CSS pixel Y coordinate to position mouse
+            x: X coordinate in screenshot pixels.
+            y: Y coordinate in screenshot pixels.
            key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
@@ -587,7 +596,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            return result

        try:
-            press_result = await bridge.press_key_at(target_tab, x, y, key)
+            from .inspection import _ensure_css_scale
+
+            css_scale = await _ensure_css_scale(target_tab)
+            s = css_scale if css_scale > 0 else 1.0
+            press_result = await bridge.press_key_at(target_tab, x * s, y * s, key)
            log_tool_call(
                "browser_press_at",
                params,
@@ -139,7 +139,10 @@ def main() -> None:
        mcp.run(transport="stdio")
    else:
        logger.info(f"Starting GCU server on {args.host}:{args.port}")
-        mcp.run(transport="http", host=args.host, port=args.port)
+        # FastMCP.run() forwards kwargs to anyio.run() instead of the
+        # transport, which breaks host/port for SSE. Invoke run_async
+        # directly so the kwargs land on run_sse_async.
+        asyncio.run(mcp.run_async(transport="sse", host=args.host, port=args.port))


 if __name__ == "__main__":