Merge branch 'feature/full-image-size'

2026-04-16 23:15:59 -07:00
parent 4303a36df0 558813e7fa
commit 83cc44bdef
11 changed files with 415 additions and 340 deletions
@@ -93,33 +93,57 @@ def clear_tab_highlights(tab_ids) -> None:
        _interaction_highlights.pop(tid, None)


-# Compact descriptor of document.activeElement. Returned by both click()
+# Compact descriptor of the focused element. Returned by both click()
 # and click_coordinate() so the agent can verify it focused what it
-# intended, then decide whether to follow up with browser_type_focused(text=...).
-# Keeping this as a single shared string avoids drift
-# between the two click paths.
+# intended. When the outer document's activeElement is an <iframe>,
+# we recurse into the iframe's document (same-origin only) so the
+# response describes the real inner element — otherwise the agent
+# always sees {tag: "iframe"} and can't tell whether it hit the
+# composer or something else inside the frame (e.g. a sidebar item
+# in LinkedIn's #interop-outlet messaging overlay).
 _FOCUSED_ELEMENT_JS = """
 (function() {
+    function describe(el) {
+        var rect = el.getBoundingClientRect();
+        var attrs = {};
+        for (var i = 0; i < el.attributes.length && i < 10; i++) {
+            attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+        }
+        return {
+            tag: el.tagName.toLowerCase(),
+            id: el.id || null,
+            className: el.className || null,
+            name: el.getAttribute('name') || null,
+            type: el.getAttribute('type') || null,
+            role: el.getAttribute('role') || null,
+            contenteditable: el.getAttribute('contenteditable') || null,
+            text: (el.innerText || '').substring(0, 200),
+            value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
+            attributes: attrs,
+            rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
+        };
+    }
    var el = document.activeElement;
    if (!el || el === document.body) return null;
-    var rect = el.getBoundingClientRect();
-    var attrs = {};
-    for (var i = 0; i < el.attributes.length && i < 10; i++) {
-        attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+    // Descend into same-origin iframes. Capped at 5 levels of
+    // nesting to bound cost. Cross-origin frames throw on
+    // contentDocument access → we catch and report the outermost
+    // iframe instead.
+    var framePath = [];
+    var depth = 0;
+    while (el && (el.tagName === 'IFRAME' || el.tagName === 'FRAME') && depth < 5) {
+        framePath.push(el.id || el.getAttribute('data-testid') || el.tagName.toLowerCase());
+        var innerDoc = null;
+        try { innerDoc = el.contentDocument; } catch (e) { innerDoc = null; }
+        if (!innerDoc) break;
+        var innerActive = innerDoc.activeElement;
+        if (!innerActive || innerActive === innerDoc.body) break;
+        el = innerActive;
+        depth++;
    }
-    return {
-        tag: el.tagName.toLowerCase(),
-        id: el.id || null,
-        className: el.className || null,
-        name: el.getAttribute('name') || null,
-        type: el.getAttribute('type') || null,
-        role: el.getAttribute('role') || null,
-        contenteditable: el.getAttribute('contenteditable') || null,
-        text: (el.innerText || '').substring(0, 200),
-        value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
-        attributes: attrs,
-        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
-    };
+    var out = describe(el);
+    if (framePath.length) out.inFrame = framePath;
+    return out;
 })()
 """

@@ -477,9 +501,9 @@ class BeelineBridge:
        """Close a tab by ID."""
        result = await self._send("tab.close", tabId=tab_id)
        # Drop per-tab state — the id may be reused by Chrome much
-        # later, and carrying a stale highlight, scale, or "attached"
-        # flag forward would misannotate screenshots, misalign click
-        # coordinates, or skip a needed reattach on the reused id.
+        # later, and carrying a stale highlight or "attached" flag
+        # forward would misannotate screenshots or skip a needed
+        # reattach on the reused id.
        self._cdp_attached.discard(tab_id)
        _interaction_highlights.pop(tab_id, None)
        from .tools.inspection import clear_tab_state
@@ -953,16 +977,36 @@ class BeelineBridge:
    async def _read_focused_element(self, tab_id: int) -> dict | None:
        """Read document.activeElement and return a compact descriptor.

-        Returns None on any failure — never raises. Used by both click
-        paths (selector-based click() and click_coordinate()) so the
-        agent gets the same response shape regardless of which one was
-        called. The descriptor lets the agent answer "did my click land
-        on an editable?" without a second round-trip.
+        The JS returns ``rect`` fields in CSS px (they come straight
+        from ``getBoundingClientRect``). We convert them to fractions
+        of the viewport here so the agent sees a rect in the same
+        coord space it passed to click / hover / press_at.
+
+        Returns None on any failure — never raises.
        """
        try:
            await self._try_enable_domain(tab_id, "Runtime")
            result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
-            return (result or {}).get("result")
+            info = (result or {}).get("result")
+            if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
+                from .tools.inspection import _viewport_sizes
+
+                vp = _viewport_sizes.get(tab_id)
+                if vp and vp[0] > 0 and vp[1] > 0:
+                    cw, ch = float(vp[0]), float(vp[1])
+                    r = info["rect"]
+                    info["rect"] = {
+                        "x": round(r.get("x", 0) / cw, 4),
+                        "y": round(r.get("y", 0) / ch, 4),
+                        "width": round(r.get("width", 0) / cw, 4),
+                        "height": round(r.get("height", 0) / ch, 4),
+                    }
+                else:
+                    # Degraded: cache missing (no screenshot taken
+                    # yet). Leave rect in CSS px and flag it so the
+                    # agent can tell.
+                    info["rectSpace"] = "css"
+            return info
        except Exception:
            return None

@@ -975,18 +1019,11 @@ class BeelineBridge:
        button_map = {"left": "left", "right": "right", "middle": "middle"}
        cdp_button = button_map.get(button, "left")

-        from .tools.inspection import _screenshot_css_scales, _screenshot_scales
-
-        phys_scale = _screenshot_scales.get(tab_id, "unset")
-        css_scale = _screenshot_css_scales.get(tab_id, "unset")
        logger.info(
-            "click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent. "
-            "stored_scales: physicalScale=%s, cssScale=%s",
+            "click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent",
            tab_id,
            x,
            y,
-            phys_scale,
-            css_scale,
        )

        await self._cdp(
@@ -255,6 +255,17 @@ def register_advanced_tools(mcp: FastMCP) -> None:

        try:
            result = await bridge.resize(target_tab, width, height)
+            # Invalidate per-tab scale caches — CSS width changed, so the
+            # cached viewport dimensions are stale. Click / rect tools
+            # will re-query innerWidth / innerHeight on next use via
+            # _ensure_viewport_size.
+            try:
+                from .inspection import _screenshot_scales, _viewport_sizes
+
+                _viewport_sizes.pop(target_tab, None)
+                _screenshot_scales.pop(target_tab, None)
+            except Exception:
+                pass
            return result
        except Exception as e:
            return {"ok": False, "error": str(e)}
@@ -23,13 +23,26 @@ from .tabs import _get_context

 logger = logging.getLogger(__name__)

-# Target width for normalized screenshots (px in the delivered image)
-_SCREENSHOT_WIDTH = 600

-# Maps tab_id -> physical scale: image_coord × scale = physical pixels (for CDP Input events)
+# Fixed output width for all screenshots (bandwidth default). This
+# number does NOT affect coordinate semantics — click / hover / press
+# and rect tools all work in fractions of the viewport (0..1), which
+# are invariant to whatever resize / tile the vision API applies. The
+# 800 px width is simply small enough to keep JPEG payloads under
+# ~150 KB on typical UI screenshots.
+_SCREENSHOT_WIDTH = 800
+
+# Per-tab viewport-size cache populated on every browser_screenshot
+# and on lazy-init inside the click tools. Stores CSS-pixel viewport
+# dimensions (window.innerWidth / window.innerHeight). Click tools
+# multiply fractional inputs by these to get CSS coords before
+# dispatching CDP events; rect tools divide CSS-pixel DOM rects by
+# these to produce fractions for the agent.
+_viewport_sizes: dict[int, tuple[int, int]] = {}
+
+# Optional debug cache — physical-px scale per tab (orig_png_w /
+# _SCREENSHOT_WIDTH). Logged only; no consumer.
 _screenshot_scales: dict[int, float] = {}
-# Maps tab_id -> CSS scale: image_coord × scale = CSS pixels (for DOM APIs / getBoundingClientRect)
-_screenshot_css_scales: dict[int, float] = {}


 def clear_tab_state(tab_ids) -> None:
@@ -51,18 +64,25 @@ def _resize_and_annotate(
    css_width: int,
    dpr: float = 1.0,
    highlights: list[dict] | None = None,
-    width: int = _SCREENSHOT_WIDTH,
-) -> tuple[str, float, float]:
-    """Resize a base64 PNG to _SCREENSHOT_WIDTH wide, annotate highlights.
+) -> tuple[str, float]:
+    """Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
+    and re-encode as JPEG quality 75.

-    Returns (new_b64, physical_scale, css_scale) where:
-      physical_scale = physical_px_per_image_px  (multiply image coords → physical px)
-      css_scale      = css_px_per_image_px        (multiply image coords → CSS px for DOM APIs)
+    The image dimensions do NOT determine click coordinates any more —
+    the tools work in viewport fractions. This helper exists purely
+    for bandwidth + annotation overlay. Returns ``(new_b64,
+    physical_scale)`` where ``physical_scale = orig_png_w / output_w``
+    is kept for debug logging.

-    Highlights have x,y,w,h in CSS pixels (what getBoundingClientRect returns,
-    and what CDP Input.dispatchMouseEvent accepts).
-    Falls back to original data if Pillow unavailable or resize fails.
+    Highlight rects arrive in CSS px; they're converted to image-space
+    for overlay drawing via the local ``css_to_image = css_width /
+    output_w`` factor (computed inline — no external cache).
    """
+    if not css_width or css_width <= 0:
+        # Bridge always supplies css_width from window.innerWidth; only
+        # reach here on a degraded response. Return the raw PNG.
+        return data, 1.0
+
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
@@ -72,48 +92,44 @@ def _resize_and_annotate(
            import struct

            orig_w = struct.unpack(">I", raw[16:20])[0]
-        raw_size_bytes = len(raw)
-        physical_scale = orig_w / width if orig_w and width else 1.0
-        css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
+        physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
        logger.warning(
-            "PIL not available — screenshot resize SKIPPED (cannot downscale image). "
-            "raw_size=%d bytes, png_width=%d, css_width=%s, dpr=%s, target_width=%d. "
-            "Returning ORIGINAL image with computed scales: physicalScale=%.4f, cssScale=%.4f. "
-            "Agent must use browser_coords() to convert image positions before clicking.",
-            raw_size_bytes,
-            orig_w,
+            "PIL not available — screenshot resize SKIPPED. "
+            "Returning raw physical-px PNG. physicalScale=%.4f, "
+            "css_width=%d, dpr=%s. Install Pillow for annotation.",
+            physical_scale,
            css_width,
            dpr,
-            width,
-            physical_scale,
-            css_scale,
        )
-        return data, round(physical_scale, 4), round(css_scale, 4)
+        return data, round(physical_scale, 4)

    try:
        raw = base64.b64decode(data)
        img = Image.open(io.BytesIO(raw)).convert("RGBA")
        orig_w, orig_h = img.size

-        physical_scale = orig_w / width
-        css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
+        physical_scale = orig_w / _SCREENSHOT_WIDTH
+        new_w = _SCREENSHOT_WIDTH
+        new_h = round(orig_h * new_w / orig_w)
+        if (new_w, new_h) != img.size:
+            img = img.resize((new_w, new_h), Image.LANCZOS)
+
+        # Local CSS → image px factor for overlay draws. Kept local —
+        # not exported, not stored, not leaked to the agent.
+        css_to_image = css_width / _SCREENSHOT_WIDTH

        logger.info(
-            "Screenshot resize: orig=%dx%d → target=%dx%d, css_width=%s, dpr=%s, physicalScale=%.4f, cssScale=%.4f",
+            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, css_to_image=%.4f",
            orig_w,
            orig_h,
-            width,
-            round(orig_h * width / orig_w),
+            new_w,
+            new_h,
            css_width,
            dpr,
            physical_scale,
-            css_scale,
+            css_to_image,
        )

-        new_w = width
-        new_h = round(orig_h * new_w / orig_w)
-        img = img.resize((new_w, new_h), Image.LANCZOS)
-
        if highlights:
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(overlay)
@@ -125,11 +141,11 @@ def _resize_and_annotate(
            for h in highlights:
                kind = h.get("kind", "rect")
                label = h.get("label", "")
-                # Highlights are in CSS px → convert to image px
-                ix = h["x"] / css_scale
-                iy = h["y"] / css_scale
-                iw = h.get("w", 0) / css_scale
-                ih = h.get("h", 0) / css_scale
+                # Highlights arrive in CSS px → convert to image px.
+                ix = h["x"] / css_to_image
+                iy = h["y"] / css_to_image
+                iw = h.get("w", 0) / css_to_image
+                ih = h.get("h", 0) / css_to_image

                if kind == "point":
                    cx, cy, r = ix, iy, 10
@@ -149,11 +165,9 @@ def _resize_and_annotate(
                        width=2,
                    )

-                # Label: show image pixel position so user knows where to look
-                img_coords = f"img:({round(ix)},{round(iy)})"
-                display_label = f"{img_coords} {label}" if label else img_coords
+                display_label = f"({round(ix)},{round(iy)}) {label}".strip()
                lx, ly = ix, max(2, iy - 16)
-                lx = max(2, min(lx, width - 120))
+                lx = max(2, min(lx, new_w - 120))
                bbox = draw.textbbox((lx, ly), display_label, font=font)
                pad = 3
                draw.rectangle(
@@ -167,22 +181,50 @@ def _resize_and_annotate(
            img = img.convert("RGB")

        buf = io.BytesIO()
-        img.save(buf, format="PNG", optimize=True)
+        img.save(buf, format="JPEG", quality=75, optimize=True)
        return (
            base64.b64encode(buf.getvalue()).decode(),
            round(physical_scale, 4),
-            round(css_scale, 4),
        )
    except Exception:
        logger.warning(
-            "Screenshot resize/annotate FAILED — returning original image with scale=1.0. "
-            "css_width=%s, dpr=%s, target_width=%d. Clicks will be misaligned.",
+            "Screenshot resize/annotate FAILED — returning original image. "
+            "css_width=%s, dpr=%s.",
            css_width,
            dpr,
-            width,
            exc_info=True,
        )
-        return data, 1.0, 1.0
+        return data, 1.0
+
+
+async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
+    """Return ``(cssWidth, cssHeight)`` for ``tab_id``, populating the
+    cache via ``window.innerWidth`` / ``window.innerHeight`` on miss.
+
+    Used by click / hover / press tools to turn fractional inputs
+    (0..1) into CSS px, and by rect tools to turn CSS-px rects into
+    fractions. Degrades to ``(1, 1)`` if the bridge can't be queried
+    — that makes every coord an identity op, which is a safe no-op
+    (and preferable to crashing).
+    """
+    cached = _viewport_sizes.get(tab_id)
+    if cached is not None and cached[0] > 0 and cached[1] > 0:
+        return cached
+    bridge = get_bridge()
+    try:
+        result = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
+        inner = (result or {}).get("result") or {}
+        cw = int(float(inner.get("w") or 0))
+        ch = int(float(inner.get("h") or 0))
+    except Exception:
+        cw, ch = 0, 0
+    if cw <= 0 or ch <= 0:
+        # Degraded: bridge didn't return viewport. Cache an identity
+        # so we don't retry on every call; corrects itself after the
+        # next successful browser_screenshot.
+        cw, ch = 1, 1
+    _viewport_sizes[tab_id] = (cw, ch)
+    return cw, ch


 def register_inspection_tools(mcp: FastMCP) -> None:
@@ -194,29 +236,33 @@ def register_inspection_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
        full_page: bool = False,
        selector: str | None = None,
-        image_type: Literal["png", "jpeg"] = "png",
        annotate: bool = True,
-        width: int = _SCREENSHOT_WIDTH,
    ) -> list:
        """
        Take a screenshot of the current page.

-        Returns a normalized image alongside text metadata (URL, size, scale
-        factors, etc.). Automatically annotates the last interaction (click,
-        hover, type) with a bounding box overlay.
+        Image is 800 px wide (JPEG quality 75, ~50–120 KB). All
+        coordinate tools work in **fractions of the viewport (0..1)**,
+        not pixels — so read a target's proportional position off this
+        image ("~35 % from the left, ~20 % from the top") and pass
+        ``(0.35, 0.20)`` to ``browser_click_coordinate`` /
+        ``browser_hover_coordinate`` / ``browser_press_at``.
+        ``browser_get_rect`` and ``browser_shadow_query`` likewise
+        return coordinates as fractions.

        Args:
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
-            full_page: Capture full scrollable page (default: False)
+            full_page: Capture full scrollable page (default: False).
+                Note: full_page images extend beyond the viewport, so
+                fractions read off them do NOT map cleanly to
+                viewport-space clicks. Use for reading / overview only,
+                not for pointing.
            selector: CSS selector to screenshot a specific element (optional)
-            image_type: Image format - png or jpeg (default: png)
            annotate: Draw bounding box of last interaction on image (default: True)
-            width: Output image width in pixels (default: 600). Use 800+ for fine
-                   text, 400 for quick layout checks.

        Returns:
-            List of content blocks: text metadata + image
+            List of content blocks: text metadata + image.
        """
        start = time.perf_counter()
        params = {
@@ -266,7 +312,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                return [TextContent(type="text", text=json.dumps(screenshot_result))]

            data = screenshot_result.get("data")
-            mime_type = screenshot_result.get("mimeType", "image/png")
            css_width = screenshot_result.get("cssWidth", 0)
            dpr = screenshot_result.get("devicePixelRatio", 1.0)

@@ -277,45 +322,50 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            if annotate and target_tab in _interaction_highlights:
                highlights = [_interaction_highlights[target_tab]]

-            # Normalize to 800px wide and annotate. Offloaded to a
-            # thread because PIL Image.open/resize/ImageDraw/composite on
-            # a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
-            # freeze the asyncio event loop and delay every concurrent
-            # tool call during a screenshot. The function is reentrant
-            # (fresh PIL Image per call, no shared state), so to_thread
-            # is safe.
-            data, physical_scale, css_scale = await asyncio.to_thread(
+            # Resize to CSS-viewport dimensions (image px == CSS px)
+            # and re-encode as JPEG. Offloaded to a thread because PIL
+            # Image.open/resize/ImageDraw/composite on a 2-megapixel
+            # PNG blocks for ~150–300 ms of CPU — plenty to freeze the
+            # asyncio event loop. Reentrant: no shared state.
+            data, physical_scale = await asyncio.to_thread(
                _resize_and_annotate,
                data,
                css_width,
                dpr,
                highlights,
-                width,
            )
-            _screenshot_scales[target_tab] = physical_scale
-            _screenshot_css_scales[target_tab] = css_scale
+            # Cache live viewport dimensions so click / hover / press /
+            # rect tools can translate fractions ↔ CSS px without
+            # asking the page again.
+            css_height = int(screenshot_result.get("cssHeight", 0)) or 0
+            if target_tab is not None and css_width > 0 and css_height > 0:
+                _viewport_sizes[target_tab] = (int(css_width), css_height)
+                _screenshot_scales[target_tab] = physical_scale

            meta = json.dumps(
                {
                    "ok": True,
                    "tabId": target_tab,
                    "url": screenshot_result.get("url", ""),
-                    "imageType": mime_type.split("/")[-1],
+                    "imageType": "jpeg",
                    "size": len(base64.b64decode(data)) if data else 0,
-                    "imageWidth": width,
+                    "imageWidth": _SCREENSHOT_WIDTH,
+                    "cssWidth": css_width,
+                    "cssHeight": css_height,
                    "fullPage": full_page,
                    "devicePixelRatio": dpr,
                    "physicalScale": physical_scale,
-                    "cssScale": css_scale,
                    "annotated": bool(highlights),
                    "scaleHint": (
-                        f"image_coord × {css_scale} = CSS px "
-                        f"→ feed to browser_click_coordinate, "
-                        f"browser_hover_coordinate, browser_press_at "
-                        f"(CDP Input events use CSS pixels). "
-                        f"image_coord × {physical_scale} = physical px "
-                        f"is debug-only on HiDPI displays and must NOT "
-                        f"be used for clicks — it overshoots by DPR×."
+                        "Coordinates for click / hover / press are "
+                        "fractions 0..1 of the viewport. Read a "
+                        "target's proportional position off this image "
+                        "(e.g. '~35 % from the left, ~20 % from the top' "
+                        "→ (0.35, 0.20)) and pass that to "
+                        "browser_click_coordinate / "
+                        "browser_hover_coordinate / browser_press_at. "
+                        "browser_get_rect / browser_shadow_query / "
+                        "focused_element.rect return fractions too."
                    ),
                }
            )
@@ -327,17 +377,17 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                    "ok": True,
                    "size": len(base64.b64decode(data)) if data else 0,
                    "url": screenshot_result.get("url", ""),
+                    "cssWidth": css_width,
+                    "cssHeight": css_height,
                    "physicalScale": physical_scale,
-                    "cssScale": css_scale,
-                    "debug_cssWidth": css_width,
-                    "debug_dpr": dpr,
+                    "dpr": dpr,
                },
                duration_ms=(time.perf_counter() - start) * 1000,
            )

            return [
                TextContent(type="text", text=meta),
-                ImageContent(type="image", data=data, mimeType=mime_type),
+                ImageContent(type="image", data=data, mimeType="image/jpeg"),
            ]
        except Exception as e:
            log_tool_call(
@@ -348,73 +398,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            )
            return [TextContent(type="text", text=json.dumps({"ok": False, "error": str(e)}))]

-    @mcp.tool()
-    def browser_coords(
-        x: float,
-        y: float,
-        tab_id: int | None = None,
-        profile: str | None = None,
-    ) -> dict:
-        """
-        Convert screenshot image coordinates to browser click coordinates.
-
-        After browser_screenshot returns a downscaled image, use this to
-        translate pixel positions you see in the image into the CSS pixel
-        coordinates that Chrome DevTools Protocol expects.
-
-        **CDP Input.dispatchMouseEvent uses CSS pixels**, so you want
-        ``css_x`` / ``css_y`` for every click/hover tool. ``physical_x/y``
-        is kept in the return for debugging on HiDPI displays — do NOT
-        feed it to clicks; on a DPR=2 screen it lands 2× too far.
-
-        Edge case: pages using ``zoom`` or ``transform: scale()`` (e.g.
-        LinkedIn's ``#interop-outlet`` shadow DOM) render in a scaled
-        local coordinate space. For those, ``getBoundingClientRect()``
-        reports pre-zoom coordinates and you may still need to multiply
-        by the element's effective zoom. Use browser_shadow_query to
-        get the zoomed rect directly.
-
-        Args:
-            x: X pixel position in the screenshot image
-            y: Y pixel position in the screenshot image
-            tab_id: Chrome tab ID (default: active tab for profile)
-            profile: Browser profile name (default: "default")
-
-        Returns:
-            Dict with css_x, css_y (primary — use these), physical_x,
-            physical_y (debug only), and scale factors.
-        """
-        ctx = _get_context(profile)
-        target_tab = tab_id or (ctx.get("activeTabId") if ctx else None)
-
-        physical_scale = _screenshot_scales.get(target_tab, 1.0) if target_tab else 1.0
-        # css_scale stored in second slot via _screenshot_css_scales
-        css_scale = _screenshot_css_scales.get(target_tab, physical_scale) if target_tab else physical_scale
-
-        return {
-            "ok": True,
-            # Primary output: CSS pixels. Feed these to click/hover/press.
-            "css_x": round(x * css_scale, 1),
-            "css_y": round(y * css_scale, 1),
-            # Debug output: raw physical pixels. DO NOT feed to clicks on
-            # HiDPI displays — CDP Input events use CSS pixels, so sending
-            # physical coordinates lands the click at roughly DPR× the
-            # intended position.
-            "physical_x": round(x * physical_scale, 1),
-            "physical_y": round(y * physical_scale, 1),
-            "physicalScale": physical_scale,
-            "cssScale": css_scale,
-            "tabId": target_tab,
-            "note": (
-                "Use css_x/css_y with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "Chrome DevTools Protocol Input.dispatchMouseEvent "
-                "operates in CSS pixels. physical_x/y is for debugging "
-                "on HiDPI displays only; feeding it to clicks lands "
-                "them at DPR× the intended coordinate."
-            ),
-        }
-
    @mcp.tool()
    async def browser_shadow_query(
        selector: str,
@@ -426,7 +409,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:

        Traverses shadow roots to find elements inside closed/open shadow DOM,
        overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
-        Returns getBoundingClientRect in both CSS and physical pixels.
+        Returns the element's bounding rect as **fractions of the
+        viewport (0..1)** — feed ``rect.cx`` / ``rect.cy`` straight
+        into browser_click_coordinate / hover_coordinate / press_at.

        Args:
            selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
@@ -435,7 +420,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with rect (CSS px) and physical rect (CSS px × DPR) of the element
+            Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
+            plus ``cssWidth`` / ``cssHeight`` for reference.
        """
        bridge = get_bridge()
        if not bridge or not bridge.is_connected:
@@ -452,36 +438,27 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        physical_scale = _screenshot_scales.get(target_tab, 1.0)
-        css_scale = _screenshot_css_scales.get(target_tab, 1.0)
-        dpr = physical_scale / css_scale if css_scale else 1.0
-
+        cw, ch = await _ensure_viewport_size(target_tab)
+        cw_f = float(cw) if cw > 0 else 1.0
+        ch_f = float(ch) if ch > 0 else 1.0
        return {
            "ok": True,
            "selector": selector,
            "tag": rect.get("tag"),
-            "css": {
-                "x": rect["x"],
-                "y": rect["y"],
-                "w": rect["w"],
-                "h": rect["h"],
-                "cx": rect["cx"],
-                "cy": rect["cy"],
-            },
-            "physical": {
-                "x": round(rect["x"] * dpr, 1),
-                "y": round(rect["y"] * dpr, 1),
-                "w": round(rect["w"] * dpr, 1),
-                "h": round(rect["h"] * dpr, 1),
-                "cx": round(rect["cx"] * dpr, 1),
-                "cy": round(rect["cy"] * dpr, 1),
+            "rect": {
+                "x": round(rect["x"] / cw_f, 4),
+                "y": round(rect["y"] / ch_f, 4),
+                "w": round(rect["w"] / cw_f, 4),
+                "h": round(rect["h"] / ch_f, 4),
+                "cx": round(rect["cx"] / cw_f, 4),
+                "cy": round(rect["cy"] / ch_f, 4),
            },
+            "cssWidth": cw,
+            "cssHeight": ch,
            "note": (
-                "Use css.cx/cy with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "CDP Input events operate in CSS pixels. "
-                "physical.* is debug-only; feeding it to clicks "
-                "lands them DPR× too far on HiDPI displays."
+                "rect fields are fractions of the viewport (0..1). "
+                "Pass rect.cx / rect.cy to browser_click_coordinate / "
+                "hover_coordinate / press_at."
            ),
        }

@@ -494,11 +471,10 @@ def register_inspection_tools(mcp: FastMCP) -> None:
        """
        Get the bounding rect of an element by CSS selector.

-        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM content.
-        Returns coordinates in CSS pixels (for clicks and DOM APIs); the
-        physical-pixel variant is returned for debugging on HiDPI displays
-        only — it must not be fed to click/hover/press tools, which use
-        CSS pixels.
+        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
+        content. Returns the rect as **fractions of the viewport
+        (0..1)** — the same coordinate space browser_click_coordinate
+        / hover_coordinate / press_at expect.

        Args:
            selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
@@ -507,7 +483,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with css and physical bounding rects
+            Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
+            plus ``cssWidth`` / ``cssHeight`` for reference.
        """
        bridge = get_bridge()
        if not bridge or not bridge.is_connected:
@@ -524,36 +501,27 @@ def register_inspection_tools(mcp: FastMCP) -> None:
            return result

        rect = result["rect"]
-        physical_scale = _screenshot_scales.get(target_tab, 1.0)
-        css_scale = _screenshot_css_scales.get(target_tab, 1.0)
-        dpr = physical_scale / css_scale if css_scale else 1.0
-
+        cw, ch = await _ensure_viewport_size(target_tab)
+        cw_f = float(cw) if cw > 0 else 1.0
+        ch_f = float(ch) if ch > 0 else 1.0
        return {
            "ok": True,
            "selector": selector,
            "tag": rect.get("tag"),
-            "css": {
-                "x": rect["x"],
-                "y": rect["y"],
-                "w": rect["w"],
-                "h": rect["h"],
-                "cx": rect["cx"],
-                "cy": rect["cy"],
-            },
-            "physical": {
-                "x": round(rect["x"] * dpr, 1),
-                "y": round(rect["y"] * dpr, 1),
-                "w": round(rect["w"] * dpr, 1),
-                "h": round(rect["h"] * dpr, 1),
-                "cx": round(rect["cx"] * dpr, 1),
-                "cy": round(rect["cy"] * dpr, 1),
+            "rect": {
+                "x": round(rect["x"] / cw_f, 4),
+                "y": round(rect["y"] / ch_f, 4),
+                "w": round(rect["w"] / cw_f, 4),
+                "h": round(rect["h"] / ch_f, 4),
+                "cx": round(rect["cx"] / cw_f, 4),
+                "cy": round(rect["cy"] / ch_f, 4),
            },
+            "cssWidth": cw,
+            "cssHeight": ch,
            "note": (
-                "Use css.cx/cy with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "CDP Input events operate in CSS pixels. "
-                "physical.* is debug-only; feeding it to clicks "
-                "lands them DPR× too far on HiDPI displays."
+                "rect fields are fractions of the viewport (0..1). "
+                "Pass rect.cx / rect.cy to browser_click_coordinate / "
+                "hover_coordinate / press_at."
            ),
        }

@@ -108,24 +108,31 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        button: Literal["left", "right", "middle"] = "left",
    ) -> dict:
        """
-        Click at specific viewport coordinates (CSS pixels).
+        Click at a FRACTION of the viewport (0..1, 0..1).

-        Chrome DevTools Protocol's Input.dispatchMouseEvent operates in
-        **CSS pixels**, not physical pixels. If you have a screenshot
-        image coordinate, convert it with ``browser_coords(x, y)`` and
-        use the returned ``css_x`` / ``css_y`` — not ``physical_x/y``.
-        On a DPR=2 display, feeding physical coordinates lands the click
-        at 2× the intended position.
+        Coordinates are **fractions of the viewport**, not pixels:
+        ``(0.5, 0.5)`` is the center, ``(0.1, 0.2)`` is 10 % from the
+        left and 20 % from the top. Read a target's proportional
+        position off ``browser_screenshot`` (or pass
+        ``rect.cx`` / ``rect.cy`` from ``browser_get_rect`` /
+        ``browser_shadow_query`` directly — they return fractions too).
+
+        Fractions are used because every vision model resizes or tiles
+        images differently (Claude ~1.15 MP target, GPT-4o 512-px
+        tiles, etc.). Proportional positions survive every such
+        transform; pixel coords do not.

        Args:
-            x: X coordinate in CSS pixels (viewport space)
-            y: Y coordinate in CSS pixels (viewport space)
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
            button: Mouse button to click (left, right, middle)

        Returns:
-            Dict with click result
+            Dict with click result, including ``focused_element``
+            describing what the click focused. ``focused_element.rect``
+            is also in fractions.
        """
        start = time.perf_counter()
        params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -148,18 +155,33 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_click_coordinate", params, result=result)
            return result

-        try:
-            from .inspection import _screenshot_css_scales, _screenshot_scales
+        # Pixel-input guard: legitimate fractions live in [0, 1]. Allow a
+        # small overshoot tolerance for edge targets.
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport. Read the target's "
+                    "proportional position off browser_screenshot, or pass "
+                    "rect.cx / rect.cy from browser_get_rect / "
+                    "browser_shadow_query (they return fractions)."
+                ),
+            }
+            log_tool_call("browser_click_coordinate", params, result=result)
+            return result

-            click_result = await bridge.click_coordinate(target_tab, x, y, button=button)
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            css_x = x * cw
+            css_y = y * ch
+            click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
            log_tool_call(
                "browser_click_coordinate",
                params,
-                result={
-                    **click_result,
-                    "debug_stored_physicalScale": _screenshot_scales.get(target_tab, "unset"),
-                    "debug_stored_cssScale": _screenshot_css_scales.get(target_tab, "unset"),
-                },
+                result={**click_result, "cssWidth": cw, "cssHeight": ch},
                duration_ms=(time.perf_counter() - start) * 1000,
            )
            return click_result
@@ -484,15 +506,16 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Hover at CSS pixel coordinates without needing a CSS selector.
+        Hover at a FRACTION of the viewport (0..1, 0..1).

        Use this instead of browser_hover when the element is in an overlay,
        shadow DOM, or virtual-rendered component that isn't in the regular DOM.
-        Pair with browser_coords to convert screenshot image positions to CSS pixels.
+        ``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
+        the tool converts to CSS px internally.

        Args:
-            x: CSS pixel X coordinate
-            y: CSS pixel Y coordinate
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")

@@ -520,8 +543,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_hover_coordinate", params, result=result)
            return result

+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport."
+                ),
+            }
+            log_tool_call("browser_hover_coordinate", params, result=result)
+            return result
+
        try:
-            hover_result = await bridge.hover_coordinate(target_tab, x, y)
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
            log_tool_call(
                "browser_hover_coordinate",
                params,
@@ -548,16 +585,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Move mouse to CSS pixel coordinates then press a key.
+        Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.

        Use this instead of browser_press when the focused element is in an overlay
        or virtual-rendered component. Moving the mouse first routes the key event
        through native browser hit-testing instead of the DOM focus chain.
-        Pair with browser_coords to convert screenshot image positions to CSS pixels.
+        ``x`` / ``y`` are fractions of the viewport; the tool converts
+        to CSS px internally.

        Args:
-            x: CSS pixel X coordinate to position mouse
-            y: CSS pixel Y coordinate to position mouse
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
            key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
            tab_id: Chrome tab ID (default: active tab)
            profile: Browser profile name (default: "default")
@@ -586,8 +624,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_press_at", params, result=result)
            return result

+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport."
+                ),
+            }
+            log_tool_call("browser_press_at", params, result=result)
+            return result
+
        try:
-            press_result = await bridge.press_key_at(target_tab, x, y, key)
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
            log_tool_call(
                "browser_press_at",
                params,
@@ -139,7 +139,10 @@ def main() -> None:
        mcp.run(transport="stdio")
    else:
        logger.info(f"Starting GCU server on {args.host}:{args.port}")
-        mcp.run(transport="http", host=args.host, port=args.port)
+        # FastMCP.run() forwards kwargs to anyio.run() instead of the
+        # transport, which breaks host/port for SSE. Invoke run_async
+        # directly so the kwargs land on run_sse_async.
+        asyncio.run(mcp.run_async(transport="sse", host=args.host, port=args.port))


 if __name__ == "__main__":