From aba0ff07ba42e5d0dbe259b90d02852bdd95afd0 Mon Sep 17 00:00:00 2001
From: Timothy <timothy@adenhq.com>
Date: Thu, 16 Apr 2026 20:29:05 -0700
Subject: [PATCH 1/2] fix: model invariant screenshot

---
 .claude/settings.json                         |  13 +-
 .mcp.json                                     |   9 +-
 .../agents/queen/reference/gcu_guide.md       |   5 +-
 core/framework/orchestrator/gcu.py            |  39 +-
 .../browser-automation/SKILL.md               |  54 +--
 .../linkedin-automation/SKILL.md              |   2 +-
 tools/src/gcu/browser/bridge.py               | 104 ++++--
 tools/src/gcu/browser/tools/advanced.py       |  10 +
 tools/src/gcu/browser/tools/inspection.py     | 335 ++++++++----------
 tools/src/gcu/browser/tools/interactions.py   |  67 ++--
 tools/src/gcu/server.py                       |   5 +-
 11 files changed, 320 insertions(+), 323 deletions(-)

diff --git a/.claude/settings.json b/.claude/settings.json
index fbdc243f..1b61758d 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -44,7 +44,18 @@
       "WebFetch(domain:docs.litellm.ai)",
       "Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
       "Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
-      "Bash(grep -v ':0$')"
+      "Bash(grep -v ':0$')",
+      "Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')",
+      "mcp__gcu-tools__browser_status",
+      "mcp__gcu-tools__browser_start",
+      "mcp__gcu-tools__browser_navigate",
+      "mcp__gcu-tools__browser_evaluate",
+      "mcp__gcu-tools__browser_screenshot",
+      "mcp__gcu-tools__browser_open",
+      "mcp__gcu-tools__browser_click_coordinate",
+      "mcp__gcu-tools__browser_get_rect",
+      "mcp__gcu-tools__browser_type_focused",
+      "mcp__gcu-tools__browser_wait"
     ],
     "additionalDirectories": [
       "/home/timothy/.hive/skills/writing-hive-skills",
diff --git a/.mcp.json b/.mcp.json
index da39e4ff..b37e500c 100644
--- a/.mcp.json
+++ b/.mcp.json
@@ -1,3 +1,10 @@
 {
-  "mcpServers": {}
+  "mcpServers": {
+    "gcu-tools": {
+      "type": "stdio",
+      "command": "uv",
+      "args": ["run", "python", "-m", "gcu.server", "--stdio"],
+      "cwd": "/home/timothy/aden/hive/tools"
+    }
+  }
 }
diff --git a/core/framework/agents/queen/reference/gcu_guide.md b/core/framework/agents/queen/reference/gcu_guide.md
index e0b8bd1d..33b7894c 100644
--- a/core/framework/agents/queen/reference/gcu_guide.md
+++ b/core/framework/agents/queen/reference/gcu_guide.md
@@ -25,7 +25,6 @@ All tools are prefixed with `browser_`:
 - `browser_screenshot` — visual capture (annotated PNG)
 <!-- /vision-only -->
 - `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`)
-- `browser_coords` — convert image pixels to CSS pixels (always use `css_x/y`, never `physical_x/y`)
 - `browser_scroll`, `browser_wait` — navigation helpers
 - `browser_evaluate` — run JavaScript
 - `browser_close`, `browser_close_finished` — tab cleanup
@@ -38,9 +37,9 @@ All tools are prefixed with `browser_`:
 
 Neither tool is "preferred" universally — they're for different jobs. Default to snapshot on text-heavy static pages, screenshot on SPAs and anything shadow-DOM-heavy. Activate the `browser-automation` skill for the full decision tree.
 
-## Coordinate rule: always CSS pixels
+## Coordinate rule
 
-Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS pixels**, not physical pixels. After a screenshot, use `browser_coords(image_x, image_y)` and feed the returned `css_x/y` (NOT `physical_x/y`) to `browser_click_coordinate`, `browser_hover_coordinate`, `browser_press_at`. Feeding physical pixels on a HiDPI display (DPR=1.6, 2, or 3) overshoots by `DPR×` and clicks land in the wrong place. `getBoundingClientRect()` already returns CSS pixels — pass through unchanged, no DPR multiplication.
+Every browser tool that takes or returns coordinates operates in **screenshot pixels** (the 800 px wide JPEG `browser_screenshot` delivers). Read a pixel off the image, pass it straight to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. `browser_get_rect` and `browser_shadow_query` return `rect.cx` / `rect.cy` in the same space. The tools translate to CSS px internally — no scale awareness required. Avoid raw `getBoundingClientRect()` via `browser_evaluate` for coord lookup; use `browser_get_rect` instead.
 
 ## System prompt tips for browser nodes
 
diff --git a/core/framework/orchestrator/gcu.py b/core/framework/orchestrator/gcu.py
index 43ce1fff..6951ce61 100644
--- a/core/framework/orchestrator/gcu.py
+++ b/core/framework/orchestrator/gcu.py
@@ -42,25 +42,21 @@ after an interaction unless you need a fresh view.
 Only fall back to `browser_get_text` for extracting small elements by
 CSS selector.
 
-## Coordinates: always CSS pixels
+## Coordinates
 
-Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS
-pixels**, not physical pixels. This is critical and often gets wrong:
+Every browser tool that takes or returns coordinates operates in
+**screenshot pixels** (the 800 px wide JPEG `browser_screenshot`
+delivers). Read a pixel off the image, pass it to
+`browser_click_coordinate` / `browser_hover_coordinate` /
+`browser_press_at`. `browser_get_rect` and `browser_shadow_query`
+return `rect.cx` / `rect.cy` in the same space. The tools handle the
+image-px → CSS-px translation internally; you do not need to know
+about CSS pixels, DPR, or any scale factor.
 
-| Tool | Unit |
-|---|---|
-| `browser_click_coordinate(x, y)` | **CSS pixels** |
-| `browser_hover_coordinate(x, y)` | **CSS pixels** |
-| `browser_press_at(x, y, key)` | **CSS pixels** |
-| `getBoundingClientRect()` | already CSS pixels — pass straight through |
-| `browser_coords(img_x, img_y)` | returns `css_x/y` (use this) and `physical_x/y` (debug only) |
-
-**Always use `css_x/y`** from `browser_coords`. Feeding `physical_x/y`
-on a HiDPI display overshoots by `DPR×` — clicks land DPR times too
-far right and down. On a DPR=1.6 display that's 60% off.
-
-Never multiply `getBoundingClientRect()` by `devicePixelRatio` — it's
-already in the right unit.
+Avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord
+lookup — that returns CSS px and will be mis-scaled when fed to click
+tools. Prefer `browser_get_rect` / `browser_shadow_query`, which
+convert for you.
 
 ## Rich-text editors (X, LinkedIn DMs, Gmail, Reddit, Slack, Discord)
 
@@ -88,11 +84,10 @@ reach shadow elements transparently.
 
 **Shadow-heavy site workflow:**
 1. `browser_screenshot()` → visual image
-2. Identify target visually → image coordinate
-3. `browser_coords(x, y)` → CSS px
-4. `browser_click_coordinate(css_x, css_y)` → lands via native hit
-   test; inputs get focused regardless of shadow depth
-5. Type via `browser_type_focused` (no selector needed — types into the
+2. Identify target visually → pixel `(x, y)` read straight off the image
+3. `browser_click_coordinate(x, y)` → lands via native hit test;
+   inputs get focused regardless of shadow depth
+4. Type via `browser_type_focused` (no selector needed — types into the
    already-focused element), or `browser_type` if you have a selector
 
 For selector-style access when you know the shadow path:
diff --git a/core/framework/skills/_default_skills/browser-automation/SKILL.md b/core/framework/skills/_default_skills/browser-automation/SKILL.md
index 0a0e7d7d..b3896f5b 100644
--- a/core/framework/skills/_default_skills/browser-automation/SKILL.md
+++ b/core/framework/skills/_default_skills/browser-automation/SKILL.md
@@ -12,25 +12,20 @@ metadata:
 
 All GCU browser tools drive a real Chrome instance through the Beeline extension and Chrome DevTools Protocol (CDP). That means clicks, keystrokes, and screenshots are processed by the actual browser's native hit testing, focus, and layout engines — **not** a synthetic event layer. Understanding this unlocks strategies that make hard sites easy.
 
-## Coordinates: always CSS pixels
+## Coordinates
 
-**Chrome DevTools Protocol `Input.dispatchMouseEvent` operates in CSS pixels, not physical pixels.**
-
-When you call `browser_coords(image_x, image_y)` after a screenshot, the returned dict has both `css_x/y` and `physical_x/y`. **Always use `css_x/y` for clicks, hovers, and key presses.**
+Every browser tool that takes or returns coordinates operates in **screenshot pixels** (the 800 px wide JPEG `browser_screenshot` delivers). Take a screenshot, read a pixel off the image, pass that number to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. Rect-returning tools (`browser_get_rect`, `browser_shadow_query`, and the `rect` inside `focused_element`) also return screenshot pixels. You do not need to convert anything, track scale factors, or know about CSS pixels or device pixel ratio — the tools translate internally before dispatching to Chrome.
 
 ```
-browser_screenshot()          → image (downscaled to 800/900 px wide)
-browser_coords(img_x, img_y)  → {css_x, css_y, physical_x, physical_y}
-browser_click_coordinate(css_x, css_y)   ← USE css_x/y
-browser_hover_coordinate(css_x, css_y)   ← USE css_x/y
-browser_press_at(css_x, css_y, key)      ← USE css_x/y
+browser_screenshot()                  → image (800 px wide JPEG)
+browser_click_coordinate(x, y)        → x, y are screenshot px
+browser_hover_coordinate(x, y)        → x, y are screenshot px
+browser_press_at(x, y, key)           → x, y are screenshot px
+browser_get_rect(selector) → rect     → rect.cx / rect.cy are screenshot px
+browser_shadow_query(...)  → rect     → same
 ```
 
-Feeding `physical_x/y` on a HiDPI display overshoots by DPR× — on a DPR=1.6 laptop, clicks land 60% too far right and down. The ratio between `physicalScale` and `cssScale` tells you the effective DPR.
-
-`getBoundingClientRect()` already returns CSS pixels — feed those values straight through to click/hover tools without any DPR multiplication.
-
-**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Use `browser_shadow_query` which handles the math, or fall back to visually picking coordinates from a screenshot.
+**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Prefer `browser_shadow_query` (which handles the math) or visually pick coordinates from a screenshot. When in doubt, avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord lookup — that returns CSS px and will be mis-scaled when passed to click tools.
 
 ## Screenshot + coordinates is shadow-agnostic — prefer it on shadow-heavy sites
 
@@ -38,7 +33,7 @@ On sites that use Shadow DOM heavily (Reddit's faceplate Web Components, LinkedI
 
 Why:
 
-- **CDP hit testing walks shadow roots natively.** `browser_click_coordinate(css_x, css_y)` routes through Chrome's native hit tester, which traverses open shadow roots automatically. You don't need to know the shadow structure.
+- **CDP hit testing walks shadow roots natively.** `browser_click_coordinate(x, y)` routes through Chrome's native hit tester, which traverses open shadow roots automatically. You don't need to know the shadow structure.
 - **Keyboard dispatch follows focus** into shadow roots. After a click focuses an input (even one three shadow levels deep), `browser_press(...)` with no selector dispatches keys to `document.activeElement`'s computed focus target.
 - **Screenshots render the real layout** regardless of DOM implementation.
 
@@ -46,12 +41,11 @@ Whereas `wait_for_selector`, `browser_click(selector=...)`, `browser_type(select
 
 ### Recommended workflow on shadow-heavy sites
 
-1. `browser_screenshot()` → visual image
-2. Identify the target visually → image pixel `(x, y)` (eyeball from the screenshot)
-3. `browser_coords(x, y)` → convert to CSS px
-4. `browser_click_coordinate(css_x, css_y)` → lands on the element via native hit testing; inputs get focused. **The response now includes `focused_element: {tag, id, role, contenteditable, rect, ...}`** — use it to verify you actually focused what you intended.
-5. `browser_type_focused(text="...")` → dispatches CDP `Input.insertText` to `document.activeElement`. Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Use `browser_type(selector, text)` instead when you have a reliable CSS selector for a light-DOM element.
-6. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`).
+1. `browser_screenshot()` → 800 px wide JPEG.
+2. Identify the target visually → pixel `(x, y)` read straight off the image.
+3. `browser_click_coordinate(x, y)` → lands on the element via native hit testing; inputs get focused. **The response includes `focused_element: {tag, id, role, contenteditable, rect, inFrame?, ...}`** — use it to verify you actually focused what you intended. `rect` is in screenshot pixels (same space as the image). When focus is inside a same-origin iframe, the descriptor reports the inner element and adds `inFrame: [...]` breadcrumbs; in that case `rect` is measured relative to the iframe's own viewport rather than the page, so do not feed it back to click tools without accounting for the frame's offset.
+4. `browser_type_focused(text="...")` → inserts text into `document.activeElement` (traverses into same-origin iframes automatically). Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Use `browser_type(selector, text)` instead when you have a reliable CSS selector for a light-DOM element.
+5. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`).
 
 ### The click→type loop (canonical pattern)
 
@@ -80,7 +74,7 @@ browser_shadow_query("reddit-search-large >>> #search-input")
 browser_get_rect("#interop-outlet >>> #ember37 >>> p")
 ```
 
-Returns the element's rect in **CSS pixels** (feed directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do.
+Returns the element's rect in **screenshot pixels** (feed `rect.cx` / `rect.cy` directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do.
 
 ## Navigation and waiting
 
@@ -220,25 +214,15 @@ Recognized without modifiers: `Enter`, `Tab`, `Escape`, `Backspace`, `Delete`, `
 ## Screenshots
 
 ```
-browser_screenshot()                    # viewport, 900 px wide by default
+browser_screenshot()                    # viewport, 800 px wide JPEG
 browser_screenshot(full_page=True)      # full scrollable page
 browser_screenshot(selector="#header")  # clip to element's rect
 ```
 
-Returns a PNG with automatic downscaling to a target width (default 900 px) plus a JSON metadata block containing `cssWidth`, `devicePixelRatio`, `physicalScale`, `cssScale`, and a `scaleHint` string. The image is also annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab.
+Returns a JPEG (quality 75, ~50–120 KB) fixed at **800 px wide** — well below the vision-API resize threshold, so the model sees the exact pixels we emit. Metadata includes `imageWidth` (800), `cssWidth` (the page's real viewport width), `cssScale` (for debug only), and `physicalScale`. The image is annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab.
 
 The highlight overlay stays visible on the page for **10 seconds** after each interaction, then fades. Before a screenshot is likely, make sure your click / hover / type happens <10 s before the screenshot.
 
-### Anatomy of the scale fields
-
-- `cssWidth` = `window.innerWidth` (CSS px)
-- `devicePixelRatio` = `window.devicePixelRatio` (often 1.6, 2, or 3 on modern displays)
-- `physicalScale = png_width / image_width` (how many physical-px per image-px)
-- `cssScale = cssWidth / image_width` (how many CSS-px per image-px)
-- Effective DPR = `physicalScale / cssScale` (should match `devicePixelRatio`)
-
-When converting image coordinates for clicks, always use `cssScale`. The `physicalScale` field is there for debugging HiDPI displays, not for inputs.
-
 ## Scrolling
 
 - Use large scroll amounts (~2000) when loading more content — sites like Twitter and LinkedIn have lazy loading for paging.
@@ -363,7 +347,7 @@ Then pass the most specific selector that uniquely identifies the right input (e
 - **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above.
 - **Using per-character `keyDown` on Lexical / Draft.js editors → keys dispatch but text never appears.** Those editors intercept `beforeinput` and route insertion through their own state machine; raw keyDown events are silently dropped. `browser_type` now uses `Input.insertText` by default (the CDP IME-commit method) which these editors accept cleanly. Only set `use_insert_text=False` when you explicitly need per-keystroke dispatch.
 - **Leaving a composer with text then trying to navigate → `beforeunload` dialog hangs the bridge.** LinkedIn and several other sites pop a native "unsent message" confirm. `browser_navigate` and `close_tab` both time out against this. Always strip `window.onbeforeunload = null` via `browser_evaluate` before any navigation after typing in a composer, or wrap your logic in a `try/finally` that runs the cleanup block.
-- **Clicking at physical pixels.** CDP uses CSS px. `browser_coords` returns both for debugging, but always feed `css_x/y` to click tools.
+- **Click landed in the wrong region (sidebar / header instead of target).** Check `focused_element` in the click response — it's ground truth for what actually got focused, including the `inFrame` breadcrumb when focus ends up inside a same-origin iframe. If it isn't the target (e.g. `className: "msg-conversation-listitem__link"` when you meant to hit a composer), adjust the pixel and retry. Coordinates you pass are screenshot pixels; the tool translates to CSS px internally, so a wrong result means you picked the wrong pixel off the image — not that a scale factor was misapplied.
 - **Calling `wait_for_selector` on a shadow element.** It'll always time out. Use `browser_shadow_query` or the screenshot + coordinate strategy.
 - **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`.
 - **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements.
diff --git a/core/framework/skills/_default_skills/linkedin-automation/SKILL.md b/core/framework/skills/_default_skills/linkedin-automation/SKILL.md
index 844a904b..9ced68a4 100644
--- a/core/framework/skills/_default_skills/linkedin-automation/SKILL.md
+++ b/core/framework/skills/_default_skills/linkedin-automation/SKILL.md
@@ -34,7 +34,7 @@ LinkedIn is the hardest mainstream site to automate because it combines **shadow
 | Pending connection card | `.invitation-card, .invitations-card, [data-test-incoming-invitation-card]` | Filter out "invited you to follow" / "subscribe" cards |
 | Accept button | `button[aria-label*="Accept"]` within the card scope | Per-card scoping is critical — there are many Accept buttons on the page |
 
-LinkedIn changes class names aggressively. If a class-based selector breaks, fall back to **`browser_screenshot` → visual identification → `browser_coords` → `browser_click_coordinate`**. The screenshot + coord path works regardless of class-name churn and regardless of shadow DOM.
+LinkedIn changes class names aggressively. If a class-based selector breaks, fall back to **`browser_screenshot` → visual identification → `browser_click_coordinate`** with the pixel you read straight off the image (coordinates are screenshot pixels; the tool converts to CSS px internally, so no manual conversion is needed). The screenshot + coord path works regardless of class-name churn and regardless of shadow DOM.
 
 ## Profile Message flow (verified end-to-end 2026-04-11)
 
diff --git a/tools/src/gcu/browser/bridge.py b/tools/src/gcu/browser/bridge.py
index fcd15552..3f12159b 100644
--- a/tools/src/gcu/browser/bridge.py
+++ b/tools/src/gcu/browser/bridge.py
@@ -80,33 +80,57 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None:
 _interaction_highlights: dict[int, dict] = {}
 
 
-# Compact descriptor of document.activeElement. Returned by both click()
+# Compact descriptor of the focused element. Returned by both click()
 # and click_coordinate() so the agent can verify it focused what it
-# intended, then decide whether to follow up with browser_type_focused(text=...).
-# Keeping this as a single shared string avoids drift
-# between the two click paths.
+# intended. When the outer document's activeElement is an <iframe>,
+# we recurse into the iframe's document (same-origin only) so the
+# response describes the real inner element — otherwise the agent
+# always sees {tag: "iframe"} and can't tell whether it hit the
+# composer or something else inside the frame (e.g. a sidebar item
+# in LinkedIn's #interop-outlet messaging overlay).
 _FOCUSED_ELEMENT_JS = """
 (function() {
+    function describe(el) {
+        var rect = el.getBoundingClientRect();
+        var attrs = {};
+        for (var i = 0; i < el.attributes.length && i < 10; i++) {
+            attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+        }
+        return {
+            tag: el.tagName.toLowerCase(),
+            id: el.id || null,
+            className: el.className || null,
+            name: el.getAttribute('name') || null,
+            type: el.getAttribute('type') || null,
+            role: el.getAttribute('role') || null,
+            contenteditable: el.getAttribute('contenteditable') || null,
+            text: (el.innerText || '').substring(0, 200),
+            value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
+            attributes: attrs,
+            rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
+        };
+    }
     var el = document.activeElement;
     if (!el || el === document.body) return null;
-    var rect = el.getBoundingClientRect();
-    var attrs = {};
-    for (var i = 0; i < el.attributes.length && i < 10; i++) {
-        attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+    // Descend into same-origin iframes. Capped at 5 levels of
+    // nesting to bound cost. Cross-origin frames expose no
+    // contentDocument (null, or a throw) → we stop descending and
+    // describe that iframe element itself.
+    var framePath = [];
+    var depth = 0;
+    while (el && (el.tagName === 'IFRAME' || el.tagName === 'FRAME') && depth < 5) {
+        framePath.push(el.id || el.getAttribute('data-testid') || el.tagName.toLowerCase());
+        var innerDoc = null;
+        try { innerDoc = el.contentDocument; } catch (e) { innerDoc = null; }
+        if (!innerDoc) break;
+        var innerActive = innerDoc.activeElement;
+        if (!innerActive || innerActive === innerDoc.body) break;
+        el = innerActive;
+        depth++;
     }
-    return {
-        tag: el.tagName.toLowerCase(),
-        id: el.id || null,
-        className: el.className || null,
-        name: el.getAttribute('name') || null,
-        type: el.getAttribute('type') || null,
-        role: el.getAttribute('role') || null,
-        contenteditable: el.getAttribute('contenteditable') || null,
-        text: (el.innerText || '').substring(0, 200),
-        value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
-        attributes: attrs,
-        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
-    };
+    var out = describe(el);
+    if (framePath.length) out.inFrame = framePath;
+    return out;
 })()
 """
 
@@ -937,16 +961,33 @@ class BeelineBridge:
     async def _read_focused_element(self, tab_id: int) -> dict | None:
         """Read document.activeElement and return a compact descriptor.
 
-        Returns None on any failure — never raises. Used by both click
-        paths (selector-based click() and click_coordinate()) so the
-        agent gets the same response shape regardless of which one was
-        called. The descriptor lets the agent answer "did my click land
-        on an editable?" without a second round-trip.
+        The JS returns ``rect`` fields in CSS px (they come straight
+        from ``getBoundingClientRect``). We scale them to screenshot
+        pixels here so the agent sees a rect in the same coord space
+        it passed to click / hover / press_at.
+
+        Returns None on any failure — never raises.
         """
         try:
             await self._try_enable_domain(tab_id, "Runtime")
             result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
-            return (result or {}).get("result")
+            info = (result or {}).get("result")
+            if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
+                # Convert CSS px rect → screenshot px using the cached
+                # scale. Fall back to 1.0 if no screenshot has been
+                # taken yet on this tab.
+                from .tools.inspection import _screenshot_css_scales
+
+                scale = _screenshot_css_scales.get(tab_id, 1.0) or 1.0
+                if scale > 0 and scale != 1.0:
+                    r = info["rect"]
+                    info["rect"] = {
+                        "x": round(r.get("x", 0) / scale, 1),
+                        "y": round(r.get("y", 0) / scale, 1),
+                        "width": round(r.get("width", 0) / scale, 1),
+                        "height": round(r.get("height", 0) / scale, 1),
+                    }
+            return info
         except Exception:
             return None
 
@@ -959,18 +1000,11 @@ class BeelineBridge:
         button_map = {"left": "left", "right": "right", "middle": "middle"}
         cdp_button = button_map.get(button, "left")
 
-        from .tools.inspection import _screenshot_css_scales, _screenshot_scales
-
-        phys_scale = _screenshot_scales.get(tab_id, "unset")
-        css_scale = _screenshot_css_scales.get(tab_id, "unset")
         logger.info(
-            "click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent. "
-            "stored_scales: physicalScale=%s, cssScale=%s",
+            "click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent",
             tab_id,
             x,
             y,
-            phys_scale,
-            css_scale,
         )
 
         await self._cdp(
diff --git a/tools/src/gcu/browser/tools/advanced.py b/tools/src/gcu/browser/tools/advanced.py
index 2b929804..4c206263 100644
--- a/tools/src/gcu/browser/tools/advanced.py
+++ b/tools/src/gcu/browser/tools/advanced.py
@@ -255,6 +255,16 @@ def register_advanced_tools(mcp: FastMCP) -> None:
 
         try:
             result = await bridge.resize(target_tab, width, height)
+            # Invalidate per-tab scale caches — CSS width changed, so the
+            # cached image→CSS multiplier is stale. Click / rect tools
+            # will re-query innerWidth on next use via _ensure_css_scale.
+            try:
+                from .inspection import _screenshot_css_scales, _screenshot_scales
+
+                _screenshot_css_scales.pop(target_tab, None)
+                _screenshot_scales.pop(target_tab, None)
+            except Exception:
+                pass
             return result
         except Exception as e:
             return {"ok": False, "error": str(e)}
diff --git a/tools/src/gcu/browser/tools/inspection.py b/tools/src/gcu/browser/tools/inspection.py
index a05058e0..6a21aded 100644
--- a/tools/src/gcu/browser/tools/inspection.py
+++ b/tools/src/gcu/browser/tools/inspection.py
@@ -23,12 +23,21 @@ from .tabs import _get_context
 
 logger = logging.getLogger(__name__)
 
-# Target width for normalized screenshots (px in the delivered image)
-_SCREENSHOT_WIDTH = 600
 
-# Maps tab_id -> physical scale: image_coord × scale = physical pixels (for CDP Input events)
+# Fixed output width for all screenshots. Chosen well below Anthropic's
+# ~1568-px vision-API resize threshold so the image the server emits is
+# the SAME image (pixel-for-pixel) the LLM sees. That preserves
+# image_px == model_px, which is the cornerstone of the "LLM works in
+# screenshot pixels only" contract — all click/hover/press/rect tools
+# translate between image pixels and CSS pixels internally.
+_SCREENSHOT_WIDTH = 800
+
+# Per-tab scale caches populated on every browser_screenshot and on
+# lazy-init inside the click tools. Both are ``image_px × scale =
+# target_px`` multipliers.
+# - _screenshot_scales[tab]      → physical scale (image → physical px, debug only)
+# - _screenshot_css_scales[tab]  → css scale      (image → CSS px, used for Input events)
 _screenshot_scales: dict[int, float] = {}
-# Maps tab_id -> CSS scale: image_coord × scale = CSS pixels (for DOM APIs / getBoundingClientRect)
 _screenshot_css_scales: dict[int, float] = {}
 
 
@@ -37,18 +46,28 @@ def _resize_and_annotate(
     css_width: int,
     dpr: float = 1.0,
     highlights: list[dict] | None = None,
-    width: int = _SCREENSHOT_WIDTH,
 ) -> tuple[str, float, float]:
-    """Resize a base64 PNG to _SCREENSHOT_WIDTH wide, annotate highlights.
+    """Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
+    and re-encode as JPEG quality 75.
 
-    Returns (new_b64, physical_scale, css_scale) where:
-      physical_scale = physical_px_per_image_px  (multiply image coords → physical px)
-      css_scale      = css_px_per_image_px        (multiply image coords → CSS px for DOM APIs)
+    CDP captures at the physical-pixel resolution (DPR × CSS). We
+    downscale to 800 px wide so the delivered image stays under
+    Anthropic's vision-API resize cap — the model sees pixel-for-pixel
+    what we send.
 
-    Highlights have x,y,w,h in CSS pixels (what getBoundingClientRect returns,
-    and what CDP Input.dispatchMouseEvent accepts).
-    Falls back to original data if Pillow unavailable or resize fails.
+    Returns ``(new_b64, physical_scale, css_scale)`` where
+    - ``physical_scale = orig_png_w / _SCREENSHOT_WIDTH`` (image → physical px)
+    - ``css_scale      = css_width / _SCREENSHOT_WIDTH`` (image → CSS px)
+
+    Highlight rects arrive in CSS px and are divided by ``css_scale``
+    before drawing so overlays land in the correct spot on the
+    800-wide output.
     """
+    if not css_width or css_width <= 0:
+        # Bridge always supplies css_width from window.innerWidth; only
+        # reach here on a degraded response. Return the raw PNG.
+        return data, 1.0, 1.0
+
     try:
         from PIL import Image, ImageDraw, ImageFont
     except ImportError:
@@ -58,21 +77,16 @@ def _resize_and_annotate(
             import struct
 
             orig_w = struct.unpack(">I", raw[16:20])[0]
-        raw_size_bytes = len(raw)
-        physical_scale = orig_w / width if orig_w and width else 1.0
-        css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
+        physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
+        css_scale = css_width / _SCREENSHOT_WIDTH
         logger.warning(
-            "PIL not available — screenshot resize SKIPPED (cannot downscale image). "
-            "raw_size=%d bytes, png_width=%d, css_width=%s, dpr=%s, target_width=%d. "
-            "Returning ORIGINAL image with computed scales: physicalScale=%.4f, cssScale=%.4f. "
-            "Agent must use browser_coords() to convert image positions before clicking.",
-            raw_size_bytes,
-            orig_w,
-            css_width,
-            dpr,
-            width,
+            "PIL not available — screenshot resize SKIPPED. "
+            "Returning raw physical-px PNG. physicalScale=%.4f, "
+            "cssScale=%.4f, css_width=%d, dpr=%s. Install Pillow for correct clicks.",
             physical_scale,
             css_scale,
+            css_width,
+            dpr,
         )
         return data, round(physical_scale, 4), round(css_scale, 4)
 
@@ -81,25 +95,25 @@ def _resize_and_annotate(
         img = Image.open(io.BytesIO(raw)).convert("RGBA")
         orig_w, orig_h = img.size
 
-        physical_scale = orig_w / width
-        css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
+        physical_scale = orig_w / _SCREENSHOT_WIDTH
+        css_scale = css_width / _SCREENSHOT_WIDTH
+        new_w = _SCREENSHOT_WIDTH
+        new_h = round(orig_h * new_w / orig_w)
+        if (new_w, new_h) != img.size:
+            img = img.resize((new_w, new_h), Image.LANCZOS)
 
         logger.info(
-            "Screenshot resize: orig=%dx%d → target=%dx%d, css_width=%s, dpr=%s, physicalScale=%.4f, cssScale=%.4f",
+            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, cssScale=%.4f",
             orig_w,
             orig_h,
-            width,
-            round(orig_h * width / orig_w),
+            new_w,
+            new_h,
             css_width,
             dpr,
             physical_scale,
             css_scale,
         )
 
-        new_w = width
-        new_h = round(orig_h * new_w / orig_w)
-        img = img.resize((new_w, new_h), Image.LANCZOS)
-
         if highlights:
             overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
             draw = ImageDraw.Draw(overlay)
@@ -111,7 +125,7 @@ def _resize_and_annotate(
             for h in highlights:
                 kind = h.get("kind", "rect")
                 label = h.get("label", "")
-                # Highlights are in CSS px → convert to image px
+                # Highlights arrive in CSS px → convert to image px.
                 ix = h["x"] / css_scale
                 iy = h["y"] / css_scale
                 iw = h.get("w", 0) / css_scale
@@ -135,11 +149,9 @@ def _resize_and_annotate(
                         width=2,
                     )
 
-                # Label: show image pixel position so user knows where to look
-                img_coords = f"img:({round(ix)},{round(iy)})"
-                display_label = f"{img_coords} {label}" if label else img_coords
+                display_label = f"({round(ix)},{round(iy)}) {label}".strip()
                 lx, ly = ix, max(2, iy - 16)
-                lx = max(2, min(lx, width - 120))
+                lx = max(2, min(lx, new_w - 120))
                 bbox = draw.textbbox((lx, ly), display_label, font=font)
                 pad = 3
                 draw.rectangle(
@@ -153,7 +165,7 @@ def _resize_and_annotate(
             img = img.convert("RGB")
 
         buf = io.BytesIO()
-        img.save(buf, format="PNG", optimize=True)
+        img.save(buf, format="JPEG", quality=75, optimize=True)
         return (
             base64.b64encode(buf.getvalue()).decode(),
             round(physical_scale, 4),
@@ -161,16 +173,38 @@ def _resize_and_annotate(
         )
     except Exception:
         logger.warning(
-            "Screenshot resize/annotate FAILED — returning original image with scale=1.0. "
-            "css_width=%s, dpr=%s, target_width=%d. Clicks will be misaligned.",
+            "Screenshot resize/annotate FAILED — returning original image. "
+            "css_width=%s, dpr=%s.",
             css_width,
             dpr,
-            width,
             exc_info=True,
         )
         return data, 1.0, 1.0
 
 
+async def _ensure_css_scale(tab_id: int) -> float:
+    """Return the image→CSS scale for ``tab_id``, populating the cache
+    via ``window.innerWidth`` if missing. Used by the click / hover /
+    press / rect tools when they run before the first screenshot.
+    """
+    cached = _screenshot_css_scales.get(tab_id)
+    if cached is not None and cached > 0:
+        return cached
+    bridge = get_bridge()
+    try:
+        result = await bridge.evaluate(tab_id, "({w: window.innerWidth})")
+        inner = float(((result or {}).get("result") or {}).get("w") or 0)
+    except Exception:
+        inner = 0.0
+    if inner <= 0:
+        # Degraded: no viewport width available. Treat image px as CSS px.
+        scale = 1.0
+    else:
+        scale = inner / _SCREENSHOT_WIDTH
+    _screenshot_css_scales[tab_id] = scale
+    return scale
+
+
 def register_inspection_tools(mcp: FastMCP) -> None:
     """Register browser inspection tools."""
 
@@ -180,26 +214,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
         profile: str | None = None,
         full_page: bool = False,
         selector: str | None = None,
-        image_type: Literal["png", "jpeg"] = "png",
         annotate: bool = True,
-        width: int = _SCREENSHOT_WIDTH,
     ) -> list:
         """
         Take a screenshot of the current page.
 
-        Returns a normalized image alongside text metadata (URL, size, scale
-        factors, etc.). Automatically annotates the last interaction (click,
-        hover, type) with a bounding box overlay.
+        Image is 800 px wide (JPEG quality 75, ~50–120 KB). The pixel
+        coordinates you read off it are the same numbers you pass to
+        ``browser_click_coordinate`` / ``browser_hover_coordinate`` /
+        ``browser_press_at`` — the tools translate to CSS internally.
+        ``browser_get_rect`` and ``browser_shadow_query`` likewise
+        return coordinates in screenshot pixels.
 
         Args:
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
             full_page: Capture full scrollable page (default: False)
             selector: CSS selector to screenshot a specific element (optional)
-            image_type: Image format - png or jpeg (default: png)
             annotate: Draw bounding box of last interaction on image (default: True)
-            width: Output image width in pixels (default: 600). Use 800+ for fine
-                   text, 400 for quick layout checks.
 
         Returns:
             List of content blocks: text metadata + image
@@ -252,7 +284,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                 return [TextContent(type="text", text=json.dumps(screenshot_result))]
 
             data = screenshot_result.get("data")
-            mime_type = screenshot_result.get("mimeType", "image/png")
             css_width = screenshot_result.get("cssWidth", 0)
             dpr = screenshot_result.get("devicePixelRatio", 1.0)
 
@@ -263,45 +294,45 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             if annotate and target_tab in _interaction_highlights:
                 highlights = [_interaction_highlights[target_tab]]
 
-            # Normalize to 800px wide and annotate. Offloaded to a
-            # thread because PIL Image.open/resize/ImageDraw/composite on
-            # a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
-            # freeze the asyncio event loop and delay every concurrent
-            # tool call during a screenshot. The function is reentrant
-            # (fresh PIL Image per call, no shared state), so to_thread
-            # is safe.
+            # Downscale to _SCREENSHOT_WIDTH (image px × cssScale = CSS px)
+            # and re-encode as JPEG. Offloaded to a thread because PIL
+            # Image.open/resize/ImageDraw/composite on a 2-megapixel
+            # PNG blocks for ~150–300 ms of CPU — plenty to freeze the
+            # asyncio event loop. Reentrant: no shared state.
             data, physical_scale, css_scale = await asyncio.to_thread(
                 _resize_and_annotate,
                 data,
                 css_width,
                 dpr,
                 highlights,
-                width,
             )
-            _screenshot_scales[target_tab] = physical_scale
-            _screenshot_css_scales[target_tab] = css_scale
+            # Refresh caches so click / hover / press / rect tools can
+            # translate image px ↔ CSS px without asking the page again.
+            if target_tab is not None:
+                _screenshot_scales[target_tab] = physical_scale
+                _screenshot_css_scales[target_tab] = css_scale
 
             meta = json.dumps(
                 {
                     "ok": True,
                     "tabId": target_tab,
                     "url": screenshot_result.get("url", ""),
-                    "imageType": mime_type.split("/")[-1],
+                    "imageType": "jpeg",
                     "size": len(base64.b64decode(data)) if data else 0,
-                    "imageWidth": width,
+                    "imageWidth": _SCREENSHOT_WIDTH,
+                    "cssWidth": css_width,
                     "fullPage": full_page,
                     "devicePixelRatio": dpr,
                     "physicalScale": physical_scale,
                     "cssScale": css_scale,
                     "annotated": bool(highlights),
                     "scaleHint": (
-                        f"image_coord × {css_scale} = CSS px "
-                        f"→ feed to browser_click_coordinate, "
-                        f"browser_hover_coordinate, browser_press_at "
-                        f"(CDP Input events use CSS pixels). "
-                        f"image_coord × {physical_scale} = physical px "
-                        f"is debug-only on HiDPI displays and must NOT "
-                        f"be used for clicks — it overshoots by DPR×."
+                        "Image is 800 px wide. Pass pixel coordinates "
+                        "you read off this image straight into "
+                        "browser_click_coordinate / "
+                        "browser_hover_coordinate / browser_press_at — "
+                        "the tools translate image px → CSS px "
+                        "internally (cssScale is for debug only)."
                     ),
                 }
             )
@@ -313,17 +344,17 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                     "ok": True,
                     "size": len(base64.b64decode(data)) if data else 0,
                     "url": screenshot_result.get("url", ""),
-                    "physicalScale": physical_scale,
+                    "cssWidth": css_width,
                     "cssScale": css_scale,
-                    "debug_cssWidth": css_width,
-                    "debug_dpr": dpr,
+                    "physicalScale": physical_scale,
+                    "dpr": dpr,
                 },
                 duration_ms=(time.perf_counter() - start) * 1000,
             )
 
             return [
                 TextContent(type="text", text=meta),
-                ImageContent(type="image", data=data, mimeType=mime_type),
+                ImageContent(type="image", data=data, mimeType="image/jpeg"),
             ]
         except Exception as e:
             log_tool_call(
@@ -334,73 +365,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             )
             return [TextContent(type="text", text=json.dumps({"ok": False, "error": str(e)}))]
 
-    @mcp.tool()
-    def browser_coords(
-        x: float,
-        y: float,
-        tab_id: int | None = None,
-        profile: str | None = None,
-    ) -> dict:
-        """
-        Convert screenshot image coordinates to browser click coordinates.
-
-        After browser_screenshot returns a downscaled image, use this to
-        translate pixel positions you see in the image into the CSS pixel
-        coordinates that Chrome DevTools Protocol expects.
-
-        **CDP Input.dispatchMouseEvent uses CSS pixels**, so you want
-        ``css_x`` / ``css_y`` for every click/hover tool. ``physical_x/y``
-        is kept in the return for debugging on HiDPI displays — do NOT
-        feed it to clicks; on a DPR=2 screen it lands 2× too far.
-
-        Edge case: pages using ``zoom`` or ``transform: scale()`` (e.g.
-        LinkedIn's ``#interop-outlet`` shadow DOM) render in a scaled
-        local coordinate space. For those, ``getBoundingClientRect()``
-        reports pre-zoom coordinates and you may still need to multiply
-        by the element's effective zoom. Use browser_shadow_query to
-        get the zoomed rect directly.
-
-        Args:
-            x: X pixel position in the screenshot image
-            y: Y pixel position in the screenshot image
-            tab_id: Chrome tab ID (default: active tab for profile)
-            profile: Browser profile name (default: "default")
-
-        Returns:
-            Dict with css_x, css_y (primary — use these), physical_x,
-            physical_y (debug only), and scale factors.
-        """
-        ctx = _get_context(profile)
-        target_tab = tab_id or (ctx.get("activeTabId") if ctx else None)
-
-        physical_scale = _screenshot_scales.get(target_tab, 1.0) if target_tab else 1.0
-        # css_scale stored in second slot via _screenshot_css_scales
-        css_scale = _screenshot_css_scales.get(target_tab, physical_scale) if target_tab else physical_scale
-
-        return {
-            "ok": True,
-            # Primary output: CSS pixels. Feed these to click/hover/press.
-            "css_x": round(x * css_scale, 1),
-            "css_y": round(y * css_scale, 1),
-            # Debug output: raw physical pixels. DO NOT feed to clicks on
-            # HiDPI displays — CDP Input events use CSS pixels, so sending
-            # physical coordinates lands the click at roughly DPR× the
-            # intended position.
-            "physical_x": round(x * physical_scale, 1),
-            "physical_y": round(y * physical_scale, 1),
-            "physicalScale": physical_scale,
-            "cssScale": css_scale,
-            "tabId": target_tab,
-            "note": (
-                "Use css_x/css_y with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "Chrome DevTools Protocol Input.dispatchMouseEvent "
-                "operates in CSS pixels. physical_x/y is for debugging "
-                "on HiDPI displays only; feeding it to clicks lands "
-                "them at DPR× the intended coordinate."
-            ),
-        }
-
     @mcp.tool()
     async def browser_shadow_query(
         selector: str,
@@ -412,7 +376,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
 
         Traverses shadow roots to find elements inside closed/open shadow DOM,
         overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
-        Returns getBoundingClientRect in both CSS and physical pixels.
+        Returns the element's bounding rect in screenshot pixels — feed
+        ``rect.cx`` / ``rect.cy`` straight into browser_click_coordinate
+        / hover_coordinate / press_at.
 
         Args:
             selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
@@ -421,7 +387,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             profile: Browser profile name (default: "default")
 
         Returns:
-            Dict with rect (CSS px) and physical rect (CSS px × DPR) of the element
+            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
         """
         bridge = get_bridge()
         if not bridge or not bridge.is_connected:
@@ -438,36 +404,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             return result
 
         rect = result["rect"]
-        physical_scale = _screenshot_scales.get(target_tab, 1.0)
-        css_scale = _screenshot_css_scales.get(target_tab, 1.0)
-        dpr = physical_scale / css_scale if css_scale else 1.0
-
+        css_scale = await _ensure_css_scale(target_tab)
+        s = css_scale if css_scale > 0 else 1.0
         return {
             "ok": True,
             "selector": selector,
             "tag": rect.get("tag"),
-            "css": {
-                "x": rect["x"],
-                "y": rect["y"],
-                "w": rect["w"],
-                "h": rect["h"],
-                "cx": rect["cx"],
-                "cy": rect["cy"],
-            },
-            "physical": {
-                "x": round(rect["x"] * dpr, 1),
-                "y": round(rect["y"] * dpr, 1),
-                "w": round(rect["w"] * dpr, 1),
-                "h": round(rect["h"] * dpr, 1),
-                "cx": round(rect["cx"] * dpr, 1),
-                "cy": round(rect["cy"] * dpr, 1),
+            "rect": {
+                "x": round(rect["x"] / s, 1),
+                "y": round(rect["y"] / s, 1),
+                "w": round(rect["w"] / s, 1),
+                "h": round(rect["h"] / s, 1),
+                "cx": round(rect["cx"] / s, 1),
+                "cy": round(rect["cy"] / s, 1),
             },
             "note": (
-                "Use css.cx/cy with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "CDP Input events operate in CSS pixels. "
-                "physical.* is debug-only; feeding it to clicks "
-                "lands them DPR× too far on HiDPI displays."
+                "rect fields are in screenshot pixels. Pass rect.cx / "
+                "rect.cy to browser_click_coordinate / "
+                "hover_coordinate / press_at."
             ),
         }
 
@@ -480,11 +434,10 @@ def register_inspection_tools(mcp: FastMCP) -> None:
         """
         Get the bounding rect of an element by CSS selector.
 
-        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM content.
-        Returns coordinates in CSS pixels (for clicks and DOM APIs); the
-        physical-pixel variant is returned for debugging on HiDPI displays
-        only — it must not be fed to click/hover/press tools, which use
-        CSS pixels.
+        Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
+        content. Returns the rect in screenshot pixels — the same
+        numbers you'd read off a browser_screenshot, and the same
+        numbers browser_click_coordinate expects.
 
         Args:
             selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
@@ -493,7 +446,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             profile: Browser profile name (default: "default")
 
         Returns:
-            Dict with css and physical bounding rects
+            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
         """
         bridge = get_bridge()
         if not bridge or not bridge.is_connected:
@@ -510,36 +463,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             return result
 
         rect = result["rect"]
-        physical_scale = _screenshot_scales.get(target_tab, 1.0)
-        css_scale = _screenshot_css_scales.get(target_tab, 1.0)
-        dpr = physical_scale / css_scale if css_scale else 1.0
-
+        css_scale = await _ensure_css_scale(target_tab)
+        s = css_scale if css_scale > 0 else 1.0
         return {
             "ok": True,
             "selector": selector,
             "tag": rect.get("tag"),
-            "css": {
-                "x": rect["x"],
-                "y": rect["y"],
-                "w": rect["w"],
-                "h": rect["h"],
-                "cx": rect["cx"],
-                "cy": rect["cy"],
-            },
-            "physical": {
-                "x": round(rect["x"] * dpr, 1),
-                "y": round(rect["y"] * dpr, 1),
-                "w": round(rect["w"] * dpr, 1),
-                "h": round(rect["h"] * dpr, 1),
-                "cx": round(rect["cx"] * dpr, 1),
-                "cy": round(rect["cy"] * dpr, 1),
+            "rect": {
+                "x": round(rect["x"] / s, 1),
+                "y": round(rect["y"] / s, 1),
+                "w": round(rect["w"] / s, 1),
+                "h": round(rect["h"] / s, 1),
+                "cx": round(rect["cx"] / s, 1),
+                "cy": round(rect["cy"] / s, 1),
             },
             "note": (
-                "Use css.cx/cy with browser_click_coordinate, "
-                "browser_hover_coordinate, browser_press_at — "
-                "CDP Input events operate in CSS pixels. "
-                "physical.* is debug-only; feeding it to clicks "
-                "lands them DPR× too far on HiDPI displays."
+                "rect fields are in screenshot pixels. Pass rect.cx / "
+                "rect.cy to browser_click_coordinate / "
+                "hover_coordinate / press_at."
             ),
         }
 
diff --git a/tools/src/gcu/browser/tools/interactions.py b/tools/src/gcu/browser/tools/interactions.py
index b5daf70f..ee5596fc 100644
--- a/tools/src/gcu/browser/tools/interactions.py
+++ b/tools/src/gcu/browser/tools/interactions.py
@@ -108,24 +108,25 @@ def register_interaction_tools(mcp: FastMCP) -> None:
         button: Literal["left", "right", "middle"] = "left",
     ) -> dict:
         """
-        Click at specific viewport coordinates (CSS pixels).
+        Click at the given SCREENSHOT pixel.
 
-        Chrome DevTools Protocol's Input.dispatchMouseEvent operates in
-        **CSS pixels**, not physical pixels. If you have a screenshot
-        image coordinate, convert it with ``browser_coords(x, y)`` and
-        use the returned ``css_x`` / ``css_y`` — not ``physical_x/y``.
-        On a DPR=2 display, feeding physical coordinates lands the click
-        at 2× the intended position.
+        ``x`` and ``y`` are pixel coordinates read directly off a
+        ``browser_screenshot`` image (800 px wide JPEG). The tool
+        multiplies them by the cached image→CSS scale for the tab
+        before dispatching to Chrome — no scale awareness required on
+        the caller side. ``browser_get_rect`` / ``browser_shadow_query``
+        return coordinates in the same (screenshot) space.
 
         Args:
-            x: X coordinate in CSS pixels (viewport space)
-            y: Y coordinate in CSS pixels (viewport space)
+            x: X coordinate in screenshot pixels.
+            y: Y coordinate in screenshot pixels.
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
             button: Mouse button to click (left, right, middle)
 
         Returns:
-            Dict with click result
+            Dict with click result, including ``focused_element``
+            describing what the click focused.
         """
         start = time.perf_counter()
         params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -149,17 +150,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
             return result
 
         try:
-            from .inspection import _screenshot_css_scales, _screenshot_scales
+            from .inspection import _ensure_css_scale
 
-            click_result = await bridge.click_coordinate(target_tab, x, y, button=button)
+            css_scale = await _ensure_css_scale(target_tab)
+            s = css_scale if css_scale > 0 else 1.0
+            css_x = x * s
+            css_y = y * s
+            click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
             log_tool_call(
                 "browser_click_coordinate",
                 params,
-                result={
-                    **click_result,
-                    "debug_stored_physicalScale": _screenshot_scales.get(target_tab, "unset"),
-                    "debug_stored_cssScale": _screenshot_css_scales.get(target_tab, "unset"),
-                },
+                result={**click_result, "cssScale": round(css_scale, 4)},
                 duration_ms=(time.perf_counter() - start) * 1000,
             )
             return click_result
@@ -484,15 +485,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
         profile: str | None = None,
     ) -> dict:
         """
-        Hover at CSS pixel coordinates without needing a CSS selector.
+        Hover at the given SCREENSHOT pixel.
 
         Use this instead of browser_hover when the element is in an overlay,
         shadow DOM, or virtual-rendered component that isn't in the regular DOM.
-        Pair with browser_coords to convert screenshot image positions to CSS pixels.
+        ``x`` / ``y`` are pixel coordinates read directly off a
+        ``browser_screenshot`` image; the tool translates to CSS px
+        internally before dispatching to Chrome.
 
         Args:
-            x: CSS pixel X coordinate
-            y: CSS pixel Y coordinate
+            x: X coordinate in screenshot pixels.
+            y: Y coordinate in screenshot pixels.
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
 
@@ -521,7 +524,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
             return result
 
         try:
-            hover_result = await bridge.hover_coordinate(target_tab, x, y)
+            from .inspection import _ensure_css_scale
+
+            css_scale = await _ensure_css_scale(target_tab)
+            s = css_scale if css_scale > 0 else 1.0
+            hover_result = await bridge.hover_coordinate(target_tab, x * s, y * s)
             log_tool_call(
                 "browser_hover_coordinate",
                 params,
@@ -548,16 +555,18 @@ def register_interaction_tools(mcp: FastMCP) -> None:
         profile: str | None = None,
     ) -> dict:
         """
-        Move mouse to CSS pixel coordinates then press a key.
+        Move mouse to the given SCREENSHOT pixel, then press a key.
 
         Use this instead of browser_press when the focused element is in an overlay
         or virtual-rendered component. Moving the mouse first routes the key event
         through native browser hit-testing instead of the DOM focus chain.
-        Pair with browser_coords to convert screenshot image positions to CSS pixels.
+        ``x`` / ``y`` are pixel coordinates read directly off a
+        ``browser_screenshot`` image; the tool translates to CSS px
+        internally.
 
         Args:
-            x: CSS pixel X coordinate to position mouse
-            y: CSS pixel Y coordinate to position mouse
+            x: X coordinate in screenshot pixels.
+            y: Y coordinate in screenshot pixels.
             key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
@@ -587,7 +596,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
             return result
 
         try:
-            press_result = await bridge.press_key_at(target_tab, x, y, key)
+            from .inspection import _ensure_css_scale
+
+            css_scale = await _ensure_css_scale(target_tab)
+            s = css_scale if css_scale > 0 else 1.0
+            press_result = await bridge.press_key_at(target_tab, x * s, y * s, key)
             log_tool_call(
                 "browser_press_at",
                 params,
diff --git a/tools/src/gcu/server.py b/tools/src/gcu/server.py
index e6b3d43b..05bcacad 100644
--- a/tools/src/gcu/server.py
+++ b/tools/src/gcu/server.py
@@ -139,7 +139,10 @@ def main() -> None:
         mcp.run(transport="stdio")
     else:
         logger.info(f"Starting GCU server on {args.host}:{args.port}")
-        mcp.run(transport="http", host=args.host, port=args.port)
+        # FastMCP.run() forwards kwargs to anyio.run() instead of the
+        # transport, which breaks host/port for SSE. Invoke run_async
+        # directly so the kwargs land on run_sse_async.
+        asyncio.run(mcp.run_async(transport="sse", host=args.host, port=args.port))
 
 
 if __name__ == "__main__":

From 558813e7fa17f7d5be90aa78a9ea5d1489cbfe5f Mon Sep 17 00:00:00 2001
From: Timothy <timothy@adenhq.com>
Date: Thu, 16 Apr 2026 22:36:41 -0700
Subject: [PATCH 2/2] feat: fraction-based visual clicks

---
 .../agents/queen/reference/gcu_guide.md       |   2 +-
 core/framework/orchestrator/gcu.py            |  23 +-
 .../browser-automation/SKILL.md               |  31 +--
 tools/src/gcu/browser/bridge.py               |  29 ++-
 tools/src/gcu/browser/tools/advanced.py       |   9 +-
 tools/src/gcu/browser/tools/inspection.py     | 229 ++++++++++--------
 tools/src/gcu/browser/tools/interactions.py   | 117 ++++++---
 7 files changed, 259 insertions(+), 181 deletions(-)

diff --git a/core/framework/agents/queen/reference/gcu_guide.md b/core/framework/agents/queen/reference/gcu_guide.md
index 33b7894c..2922f58e 100644
--- a/core/framework/agents/queen/reference/gcu_guide.md
+++ b/core/framework/agents/queen/reference/gcu_guide.md
@@ -39,7 +39,7 @@ Neither tool is "preferred" universally — they're for different jobs. Default
 
 ## Coordinate rule
 
-Every browser tool that takes or returns coordinates operates in **screenshot pixels** (the 800 px wide JPEG `browser_screenshot` delivers). Read a pixel off the image, pass it straight to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. `browser_get_rect` and `browser_shadow_query` return `rect.cx` / `rect.cy` in the same space. The tools translate to CSS px internally — no scale awareness required. Avoid raw `getBoundingClientRect()` via `browser_evaluate` for coord lookup; use `browser_get_rect` instead.
+Every browser tool that takes or returns coordinates operates in **fractions of the viewport (0..1 for both axes)**. Read a target's proportional position off `browser_screenshot` ("~35% from the left, ~20% from the top" → `(0.35, 0.20)`) and pass that to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. `browser_get_rect` and `browser_shadow_query` return `rect.cx` / `rect.cy` as fractions. The tools multiply by `cssWidth` / `cssHeight` internally — no scale awareness required. Fractions are used because every vision model (Claude, GPT-4o, Gemini, local VLMs) resizes/tiles images differently; proportions are invariant. Avoid raw `getBoundingClientRect()` via `browser_evaluate` for coord lookup; use `browser_get_rect` instead.
 
 ## System prompt tips for browser nodes
 
diff --git a/core/framework/orchestrator/gcu.py b/core/framework/orchestrator/gcu.py
index 6951ce61..1ac459cc 100644
--- a/core/framework/orchestrator/gcu.py
+++ b/core/framework/orchestrator/gcu.py
@@ -45,18 +45,23 @@ CSS selector.
 ## Coordinates
 
 Every browser tool that takes or returns coordinates operates in
-**screenshot pixels** (the 800 px wide JPEG `browser_screenshot`
-delivers). Read a pixel off the image, pass it to
-`browser_click_coordinate` / `browser_hover_coordinate` /
-`browser_press_at`. `browser_get_rect` and `browser_shadow_query`
-return `rect.cx` / `rect.cy` in the same space. The tools handle the
-image-px → CSS-px translation internally; you do not need to know
-about CSS pixels, DPR, or any scale factor.
+**fractions of the viewport (0..1 for both axes)**. Read a target's
+proportional position off `browser_screenshot` — "this button is
+~35% from the left, ~20% from the top" → pass `(0.35, 0.20)`.
+`browser_get_rect` and `browser_shadow_query` return `rect.cx` /
+`rect.cy` as fractions in the same space. The tools handle the
+fraction → CSS-px multiplication internally; you do not need to
+track image pixels, DPR, or any scale factor.
+
+Why fractions: every vision model (Claude, GPT-4o, Gemini, local
+VLMs) resizes or tiles images differently before the model sees the
+pixels. Proportions survive every such transform; pixel coordinates
+only "work" per-model and break when you swap backends.
 
 Avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord
-lookup — that returns CSS px and will be mis-scaled when fed to click
+lookup — that returns CSS px and will be wrong when fed to click
 tools. Prefer `browser_get_rect` / `browser_shadow_query`, which
-convert for you.
+return fractions.
 
 ## Rich-text editors (X, LinkedIn DMs, Gmail, Reddit, Slack, Discord)
 
diff --git a/core/framework/skills/_default_skills/browser-automation/SKILL.md b/core/framework/skills/_default_skills/browser-automation/SKILL.md
index b3896f5b..311ac1d9 100644
--- a/core/framework/skills/_default_skills/browser-automation/SKILL.md
+++ b/core/framework/skills/_default_skills/browser-automation/SKILL.md
@@ -14,18 +14,20 @@ All GCU browser tools drive a real Chrome instance through the Beeline extension
 
 ## Coordinates
 
-Every browser tool that takes or returns coordinates operates in **screenshot pixels** (the 800 px wide JPEG `browser_screenshot` delivers). Take a screenshot, read a pixel off the image, pass that number to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. Rect-returning tools (`browser_get_rect`, `browser_shadow_query`, and the `rect` inside `focused_element`) also return screenshot pixels. You do not need to convert anything, track scale factors, or know about CSS pixels or device pixel ratio — the tools translate internally before dispatching to Chrome.
+Every browser tool that takes or returns coordinates operates in **fractions of the viewport (0..1 for both axes)**. Read a target's proportional position off `browser_screenshot` — "this button is about 35% from the left and 20% from the top" → pass `(0.35, 0.20)`. Rect-returning tools (`browser_get_rect`, `browser_shadow_query`, and the `rect` inside `focused_element`) also return fractions. The tools convert to CSS pixels internally before dispatching to Chrome.
 
 ```
-browser_screenshot()                  → image (800 px wide JPEG)
-browser_click_coordinate(x, y)        → x, y are screenshot px
-browser_hover_coordinate(x, y)        → x, y are screenshot px
-browser_press_at(x, y, key)           → x, y are screenshot px
-browser_get_rect(selector) → rect     → rect.cx / rect.cy are screenshot px
+browser_screenshot()                  → image + cssWidth/cssHeight in meta
+browser_click_coordinate(x, y)        → x, y are fractions 0..1
+browser_hover_coordinate(x, y)        → fractions
+browser_press_at(x, y, key)           → fractions
+browser_get_rect(selector) → rect     → rect.cx / rect.cy are fractions
 browser_shadow_query(...)  → rect     → same
 ```
 
-**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Prefer `browser_shadow_query` (which handles the math) or visually pick coordinates from a screenshot. When in doubt, avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord lookup — that returns CSS px and will be mis-scaled when passed to click tools.
+**Why fractions:** every vision model (Claude ~1.15 MP target, GPT-4o 512-px tiles, Gemini, local VLMs) resizes or tiles images differently before the model sees the pixels. Proportions survive every such transform; pixel coordinates only "work" per-model and silently break when you swap backends. Four-decimal precision (`0.0001` ≈ 0.17 CSS px on a 1717-wide viewport) is more than enough for the tightest targets.
+
+**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Prefer `browser_shadow_query` (which handles the math and returns fractions) or visually pick coordinates from a screenshot. Avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord lookup — that returns CSS px and will be wrong when fed to click tools.
 
 ## Screenshot + coordinates is shadow-agnostic — prefer it on shadow-heavy sites
 
@@ -41,9 +43,9 @@ Whereas `wait_for_selector`, `browser_click(selector=...)`, `browser_type(select
 
 ### Recommended workflow on shadow-heavy sites
 
-1. `browser_screenshot()` → 800 px wide JPEG.
-2. Identify the target visually → pixel `(x, y)` read straight off the image.
-3. `browser_click_coordinate(x, y)` → lands on the element via native hit testing; inputs get focused. **The response includes `focused_element: {tag, id, role, contenteditable, rect, inFrame?, ...}`** — use it to verify you actually focused what you intended. `rect` is in screenshot pixels (same space as the image). When focus is inside a same-origin iframe, the descriptor reports the inner element and adds `inFrame: [...]` breadcrumbs.
+1. `browser_screenshot()` → JPEG; meta includes `cssWidth`/`cssHeight` for reference.
+2. Identify the target visually → estimate its proportional position `(fx, fy)` where each is in `0..1`.
+3. `browser_click_coordinate(fx, fy)` → tool converts to CSS px and dispatches; CDP native hit testing focuses the element. **The response includes `focused_element: {tag, id, role, contenteditable, rect, inFrame?, ...}`** — use it to verify you actually focused what you intended. `rect` is in fractions (same space as your input). When focus is inside a same-origin iframe, the descriptor reports the inner element and adds `inFrame: [...]` breadcrumbs.
 4. `browser_type_focused(text="...")` → inserts text into `document.activeElement` (traverses into same-origin iframes automatically). Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Use `browser_type(selector, text)` instead when you have a reliable CSS selector for a light-DOM element.
 5. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`).
 
@@ -74,7 +76,7 @@ browser_shadow_query("reddit-search-large >>> #search-input")
 browser_get_rect("#interop-outlet >>> #ember37 >>> p")
 ```
 
-Returns the element's rect in **screenshot pixels** (feed `rect.cx` / `rect.cy` directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do.
+Returns the element's rect as **fractions of the viewport** (feed `rect.cx` / `rect.cy` directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do.
 
 ## Navigation and waiting
 
@@ -215,11 +217,11 @@ Recognized without modifiers: `Enter`, `Tab`, `Escape`, `Backspace`, `Delete`, `
 
 ```
 browser_screenshot()                    # viewport, 800 px wide JPEG
-browser_screenshot(full_page=True)      # full scrollable page
+browser_screenshot(full_page=True)      # full scrollable page (overview only — don't click off a full-page shot)
 browser_screenshot(selector="#header")  # clip to element's rect
 ```
 
-Returns a JPEG (quality 75, ~50–120 KB) fixed at **800 px wide** — well below the vision-API resize threshold, so the model sees the exact pixels we emit. Metadata includes `imageWidth` (800), `cssWidth` (the page's real viewport width), `cssScale` (for debug only), and `physicalScale`. The image is annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab.
+Returns a JPEG (quality 75, ~50–120 KB) at 800 px wide. The pixel width is purely a bandwidth choice; all tool coordinates are fractions of the viewport and are invariant to image size. Metadata includes `imageWidth` (800), `cssWidth`, `cssHeight` (for reference), and `physicalScale`. The image is annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab.
 
 The highlight overlay stays visible on the page for **10 seconds** after each interaction, then fades. Before a screenshot is likely, make sure your click / hover / type happens <10 s before the screenshot.
 
@@ -347,7 +349,8 @@ Then pass the most specific selector that uniquely identifies the right input (e
 - **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above.
 - **Using per-character `keyDown` on Lexical / Draft.js editors → keys dispatch but text never appears.** Those editors intercept `beforeinput` and route insertion through their own state machine; raw keyDown events are silently dropped. `browser_type` now uses `Input.insertText` by default (the CDP IME-commit method) which these editors accept cleanly. Only set `use_insert_text=False` when you explicitly need per-keystroke dispatch.
 - **Leaving a composer with text then trying to navigate → `beforeunload` dialog hangs the bridge.** LinkedIn and several other sites pop a native "unsent message" confirm. `browser_navigate` and `close_tab` both time out against this. Always strip `window.onbeforeunload = null` via `browser_evaluate` before any navigation after typing in a composer, or wrap your logic in a `try/finally` that runs the cleanup block.
-- **Click landed in the wrong region (sidebar / header instead of target).** Check `focused_element` in the click response — it's ground truth for what actually got focused, including the `inFrame` breadcrumb when focus ends up inside a same-origin iframe. If it isn't the target (e.g. `className: "msg-conversation-listitem__link"` when you meant to hit a composer), adjust the pixel and retry. Coordinates you pass are screenshot pixels; the tool translates to CSS px internally, so a wrong result means you picked the wrong pixel off the image — not that any scale went sideways.
+- **Click landed in the wrong region (sidebar / header instead of target).** Check `focused_element` in the click response — it's ground truth for what actually got focused, including the `inFrame` breadcrumb when focus ends up inside a same-origin iframe. If it isn't the target (e.g. `className: "msg-conversation-listitem__link"` when you meant to hit a composer), adjust the fraction and retry. Coordinates you pass are fractions of the viewport; the tool multiplies by `cssWidth` / `cssHeight` internally, so a wrong result means your estimated proportion was off — not that any scale went sideways.
+- **Accidentally passing pixels to click / hover / press_at.** The tools reject any coord outside `[-0.1, 1.5]` with a clear error. If you see that error, you passed a pixel (like 815) instead of a fraction (like 0.475). Use `browser_get_rect` to get exact fractional cx/cy, or read proportions off `browser_screenshot`.
 - **Calling `wait_for_selector` on a shadow element.** It'll always time out. Use `browser_shadow_query` or the screenshot + coordinate strategy.
 - **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`.
 - **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements.
diff --git a/tools/src/gcu/browser/bridge.py b/tools/src/gcu/browser/bridge.py
index 3f12159b..e6ed878d 100644
--- a/tools/src/gcu/browser/bridge.py
+++ b/tools/src/gcu/browser/bridge.py
@@ -962,9 +962,9 @@ class BeelineBridge:
         """Read document.activeElement and return a compact descriptor.
 
         The JS returns ``rect`` fields in CSS px (they come straight
-        from ``getBoundingClientRect``). We scale them to screenshot
-        pixels here so the agent sees a rect in the same coord space
-        it passed to click / hover / press_at.
+        from ``getBoundingClientRect``). We convert them to fractions
+        of the viewport here so the agent sees a rect in the same
+        coord space it passed to click / hover / press_at.
 
         Returns None on any failure — never raises.
         """
@@ -973,20 +973,23 @@ class BeelineBridge:
             result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
             info = (result or {}).get("result")
             if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
-                # Convert CSS px rect → screenshot px using the cached
-                # scale. Fall back to 1.0 if no screenshot has been
-                # taken yet on this tab.
-                from .tools.inspection import _screenshot_css_scales
+                from .tools.inspection import _viewport_sizes
 
-                scale = _screenshot_css_scales.get(tab_id, 1.0) or 1.0
-                if scale > 0 and scale != 1.0:
+                vp = _viewport_sizes.get(tab_id)
+                if vp and vp[0] > 0 and vp[1] > 0:
+                    cw, ch = float(vp[0]), float(vp[1])
                     r = info["rect"]
                     info["rect"] = {
-                        "x": round(r.get("x", 0) / scale, 1),
-                        "y": round(r.get("y", 0) / scale, 1),
-                        "width": round(r.get("width", 0) / scale, 1),
-                        "height": round(r.get("height", 0) / scale, 1),
+                        "x": round(r.get("x", 0) / cw, 4),
+                        "y": round(r.get("y", 0) / ch, 4),
+                        "width": round(r.get("width", 0) / cw, 4),
+                        "height": round(r.get("height", 0) / ch, 4),
                     }
+                else:
+                    # Degraded: cache missing (no screenshot taken
+                    # yet). Leave rect in CSS px and flag it so the
+                    # agent can tell.
+                    info["rectSpace"] = "css"
             return info
         except Exception:
             return None
diff --git a/tools/src/gcu/browser/tools/advanced.py b/tools/src/gcu/browser/tools/advanced.py
index 4c206263..b825ec1b 100644
--- a/tools/src/gcu/browser/tools/advanced.py
+++ b/tools/src/gcu/browser/tools/advanced.py
@@ -256,12 +256,13 @@ def register_advanced_tools(mcp: FastMCP) -> None:
         try:
             result = await bridge.resize(target_tab, width, height)
             # Invalidate per-tab scale caches — CSS width changed, so the
-            # cached image→CSS multiplier is stale. Click / rect tools
-            # will re-query innerWidth on next use via _ensure_css_scale.
+            # cached viewport dimensions are stale. Click / rect tools
+            # will re-query innerWidth / innerHeight on next use via
+            # _ensure_viewport_size.
             try:
-                from .inspection import _screenshot_css_scales, _screenshot_scales
+                from .inspection import _screenshot_scales, _viewport_sizes
 
-                _screenshot_css_scales.pop(target_tab, None)
+                _viewport_sizes.pop(target_tab, None)
                 _screenshot_scales.pop(target_tab, None)
             except Exception:
                 pass
diff --git a/tools/src/gcu/browser/tools/inspection.py b/tools/src/gcu/browser/tools/inspection.py
index 6a21aded..98b05333 100644
--- a/tools/src/gcu/browser/tools/inspection.py
+++ b/tools/src/gcu/browser/tools/inspection.py
@@ -24,21 +24,25 @@ from .tabs import _get_context
 logger = logging.getLogger(__name__)
 
 
-# Fixed output width for all screenshots. Chosen well below Anthropic's
-# ~1568-px vision-API resize threshold so the image the server emits is
-# the SAME image (pixel-for-pixel) the LLM sees. That preserves
-# image_px == model_px, which is the cornerstone of the "LLM works in
-# screenshot pixels only" contract — all click/hover/press/rect tools
-# translate between image pixels and CSS pixels internally.
+# Fixed output width for all screenshots (bandwidth default). This
+# number does NOT affect coordinate semantics — click / hover / press
+# and rect tools all work in fractions of the viewport (0..1), which
+# are invariant to whatever resize / tile the vision API applies. The
+# 800 px width is simply small enough to keep JPEG payloads under
+# ~150 KB on typical UI screenshots.
 _SCREENSHOT_WIDTH = 800
 
-# Per-tab scale caches populated on every browser_screenshot and on
-# lazy-init inside the click tools. Both are ``image_px × scale =
-# target_px`` multipliers.
-# - _screenshot_scales[tab]      → physical scale (image → physical px, debug only)
-# - _screenshot_css_scales[tab]  → css scale      (image → CSS px, used for Input events)
+# Per-tab viewport-size cache populated on every browser_screenshot
+# and on lazy-init inside the click tools. Stores CSS-pixel viewport
+# dimensions (window.innerWidth / window.innerHeight). Click tools
+# multiply fractional inputs by these to get CSS coords before
+# dispatching CDP events; rect tools divide CSS-pixel DOM rects by
+# these to produce fractions for the agent.
+_viewport_sizes: dict[int, tuple[int, int]] = {}
+
+# Optional debug cache — physical-px scale per tab (orig_png_w /
+# _SCREENSHOT_WIDTH). Logged only; no consumer.
 _screenshot_scales: dict[int, float] = {}
-_screenshot_css_scales: dict[int, float] = {}
 
 
 def _resize_and_annotate(
@@ -46,27 +50,24 @@ def _resize_and_annotate(
     css_width: int,
     dpr: float = 1.0,
     highlights: list[dict] | None = None,
-) -> tuple[str, float, float]:
+) -> tuple[str, float]:
     """Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
     and re-encode as JPEG quality 75.
 
-    CDP captures at the physical-pixel resolution (DPR × CSS). We
-    downscale to 800 px wide so the delivered image stays under
-    Anthropic's vision-API resize cap — the model sees pixel-for-pixel
-    what we send.
+    The image dimensions do NOT determine click coordinates any more —
+    the tools work in viewport fractions. This helper exists purely
+    for bandwidth + annotation overlay. Returns ``(new_b64,
+    physical_scale)`` where ``physical_scale = orig_png_w / output_w``
+    is kept for debug logging.
 
-    Returns ``(new_b64, physical_scale, css_scale)`` where
-    - ``physical_scale = orig_png_w / _SCREENSHOT_WIDTH`` (image → physical px)
-    - ``css_scale      = css_width / _SCREENSHOT_WIDTH`` (image → CSS px)
-
-    Highlight rects arrive in CSS px and are divided by ``css_scale``
-    before drawing so overlays land in the correct spot on the
-    800-wide output.
+    Highlight rects arrive in CSS px; each is divided by the local
+    ``css_to_image = css_width / output_w`` ratio (CSS px per image
+    px, computed inline — no external cache) to land in image space.
     """
     if not css_width or css_width <= 0:
         # Bridge always supplies css_width from window.innerWidth; only
         # reach here on a degraded response. Return the raw PNG.
-        return data, 1.0, 1.0
+        return data, 1.0
 
     try:
         from PIL import Image, ImageDraw, ImageFont
@@ -78,17 +79,15 @@ def _resize_and_annotate(
 
             orig_w = struct.unpack(">I", raw[16:20])[0]
         physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
-        css_scale = css_width / _SCREENSHOT_WIDTH
         logger.warning(
             "PIL not available — screenshot resize SKIPPED. "
             "Returning raw physical-px PNG. physicalScale=%.4f, "
-            "cssScale=%.4f, css_width=%d, dpr=%s. Install Pillow for correct clicks.",
+            "css_width=%d, dpr=%s. Install Pillow for annotation.",
             physical_scale,
-            css_scale,
             css_width,
             dpr,
         )
-        return data, round(physical_scale, 4), round(css_scale, 4)
+        return data, round(physical_scale, 4)
 
     try:
         raw = base64.b64decode(data)
@@ -96,14 +95,17 @@ def _resize_and_annotate(
         orig_w, orig_h = img.size
 
         physical_scale = orig_w / _SCREENSHOT_WIDTH
-        css_scale = css_width / _SCREENSHOT_WIDTH
         new_w = _SCREENSHOT_WIDTH
         new_h = round(orig_h * new_w / orig_w)
         if (new_w, new_h) != img.size:
             img = img.resize((new_w, new_h), Image.LANCZOS)
 
+        # CSS px per image px; CSS coords are divided by it for overlay
+        # draws. Kept local — not exported, stored, or leaked to the agent.
+        css_to_image = css_width / _SCREENSHOT_WIDTH
+
         logger.info(
-            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, cssScale=%.4f",
+            "Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, css_to_image=%.4f",
             orig_w,
             orig_h,
             new_w,
@@ -111,7 +113,7 @@ def _resize_and_annotate(
             css_width,
             dpr,
             physical_scale,
-            css_scale,
+            css_to_image,
         )
 
         if highlights:
@@ -126,10 +128,10 @@ def _resize_and_annotate(
                 kind = h.get("kind", "rect")
                 label = h.get("label", "")
                 # Highlights arrive in CSS px → convert to image px.
-                ix = h["x"] / css_scale
-                iy = h["y"] / css_scale
-                iw = h.get("w", 0) / css_scale
-                ih = h.get("h", 0) / css_scale
+                ix = h["x"] / css_to_image
+                iy = h["y"] / css_to_image
+                iw = h.get("w", 0) / css_to_image
+                ih = h.get("h", 0) / css_to_image
 
                 if kind == "point":
                     cx, cy, r = ix, iy, 10
@@ -169,7 +171,6 @@ def _resize_and_annotate(
         return (
             base64.b64encode(buf.getvalue()).decode(),
             round(physical_scale, 4),
-            round(css_scale, 4),
         )
     except Exception:
         logger.warning(
@@ -179,30 +180,37 @@ def _resize_and_annotate(
             dpr,
             exc_info=True,
         )
-        return data, 1.0, 1.0
+        return data, 1.0
 
 
-async def _ensure_css_scale(tab_id: int) -> float:
-    """Return the image→CSS scale for ``tab_id``, populating the cache
-    via ``window.innerWidth`` if missing. Used by click tools when the
-    agent clicks before the first screenshot has been taken.
+async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
+    """Return ``(cssWidth, cssHeight)`` for ``tab_id``, populating the
+    cache via ``window.innerWidth`` / ``window.innerHeight`` on miss.
+
+    Used by click / hover / press tools to turn fractional inputs
+    (0..1) into CSS px, and by rect tools to turn CSS-px rects into
+    fractions. Degrades to ``(1, 1)`` if the bridge can't be queried
+    — coords then pass through unscaled (fractions are used as CSS px
+    and rects stay in CSS px), which is preferable to crashing.
     """
-    cached = _screenshot_css_scales.get(tab_id)
-    if cached is not None and cached > 0:
+    cached = _viewport_sizes.get(tab_id)
+    if cached is not None and cached[0] > 0 and cached[1] > 0:
         return cached
     bridge = get_bridge()
     try:
-        result = await bridge.evaluate(tab_id, "({w: window.innerWidth})")
-        inner = float(((result or {}).get("result") or {}).get("w") or 0)
+        result = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
+        inner = (result or {}).get("result") or {}
+        cw = int(float(inner.get("w") or 0))
+        ch = int(float(inner.get("h") or 0))
     except Exception:
-        inner = 0.0
-    if inner <= 0:
-        # Degraded: no viewport width available. Treat image px as CSS px.
-        scale = 1.0
-    else:
-        scale = inner / _SCREENSHOT_WIDTH
-    _screenshot_css_scales[tab_id] = scale
-    return scale
+        cw, ch = 0, 0
+    if cw <= 0 or ch <= 0:
+        # Degraded: bridge didn't return viewport. Cache an identity
+        # so we don't retry on every call; corrects itself after the
+        # next successful browser_screenshot.
+        cw, ch = 1, 1
+    _viewport_sizes[tab_id] = (cw, ch)
+    return cw, ch
 
 
 def register_inspection_tools(mcp: FastMCP) -> None:
@@ -219,22 +227,28 @@ def register_inspection_tools(mcp: FastMCP) -> None:
         """
         Take a screenshot of the current page.
 
-        Image is 800 px wide (JPEG quality 75, ~50–120 KB). A pixel you
-        see in this image is the same number you pass to
-        ``browser_click_coordinate`` / ``browser_hover_coordinate`` /
-        ``browser_press_at`` — the tools translate to CSS internally.
+        Image is 800 px wide (JPEG quality 75, ~50–120 KB). All
+        coordinate tools work in **fractions of the viewport (0..1)**,
+        not pixels — so read a target's proportional position off this
+        image ("~35 % from the left, ~20 % from the top") and pass
+        ``(0.35, 0.20)`` to ``browser_click_coordinate`` /
+        ``browser_hover_coordinate`` / ``browser_press_at``.
         ``browser_get_rect`` and ``browser_shadow_query`` likewise
-        return coordinates in screenshot pixels.
+        return coordinates as fractions.
 
         Args:
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
-            full_page: Capture full scrollable page (default: False)
+            full_page: Capture full scrollable page (default: False).
+                Note: full_page images extend beyond the viewport, so
+                fractions read off them do NOT map cleanly to
+                viewport-space clicks. Use for reading / overview only,
+                not for pointing.
             selector: CSS selector to screenshot a specific element (optional)
             annotate: Draw bounding box of last interaction on image (default: True)
 
         Returns:
-            List of content blocks: text metadata + image
+            List of content blocks: text metadata + image.
         """
         start = time.perf_counter()
         params = {
@@ -299,18 +313,20 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             # Image.open/resize/ImageDraw/composite on a 2-megapixel
             # PNG blocks for ~150–300 ms of CPU — plenty to freeze the
             # asyncio event loop. Reentrant: no shared state.
-            data, physical_scale, css_scale = await asyncio.to_thread(
+            data, physical_scale = await asyncio.to_thread(
                 _resize_and_annotate,
                 data,
                 css_width,
                 dpr,
                 highlights,
             )
-            # Refresh caches so click / hover / press / rect tools can
-            # translate image px ↔ CSS px without asking the page again.
-            if target_tab is not None:
+            # Cache live viewport dimensions so click / hover / press /
+            # rect tools can translate fractions ↔ CSS px without
+            # asking the page again.
+            css_height = int(screenshot_result.get("cssHeight", 0)) or 0
+            if target_tab is not None and css_width > 0 and css_height > 0:
+                _viewport_sizes[target_tab] = (int(css_width), css_height)
                 _screenshot_scales[target_tab] = physical_scale
-                _screenshot_css_scales[target_tab] = css_scale
 
             meta = json.dumps(
                 {
@@ -321,18 +337,21 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                     "size": len(base64.b64decode(data)) if data else 0,
                     "imageWidth": _SCREENSHOT_WIDTH,
                     "cssWidth": css_width,
+                    "cssHeight": css_height,
                     "fullPage": full_page,
                     "devicePixelRatio": dpr,
                     "physicalScale": physical_scale,
-                    "cssScale": css_scale,
                     "annotated": bool(highlights),
                     "scaleHint": (
-                        "Image is 800 px wide. Pass pixel coordinates "
-                        "you read off this image straight into "
+                        "Coordinates for click / hover / press are "
+                        "fractions 0..1 of the viewport. Read a "
+                        "target's proportional position off this image "
+                        "(e.g. '~35 % from the left, ~20 % from the top' "
+                        "→ (0.35, 0.20)) and pass that to "
                         "browser_click_coordinate / "
-                        "browser_hover_coordinate / browser_press_at — "
-                        "the tools translate image px → CSS px "
-                        "internally (cssScale is for debug only)."
+                        "browser_hover_coordinate / browser_press_at. "
+                        "browser_get_rect / browser_shadow_query / "
+                        "focused_element.rect return fractions too."
                     ),
                 }
             )
@@ -345,7 +364,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
                     "size": len(base64.b64decode(data)) if data else 0,
                     "url": screenshot_result.get("url", ""),
                     "cssWidth": css_width,
-                    "cssScale": css_scale,
+                    "cssHeight": css_height,
                     "physicalScale": physical_scale,
                     "dpr": dpr,
                 },
@@ -376,9 +395,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
 
         Traverses shadow roots to find elements inside closed/open shadow DOM,
         overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
-        Returns the element's bounding rect in screenshot pixels — feed
-        ``rect.cx`` / ``rect.cy`` straight into browser_click_coordinate
-        / hover_coordinate / press_at.
+        Returns the element's bounding rect as **fractions of the
+        viewport (0..1)** — feed ``rect.cx`` / ``rect.cy`` straight
+        into browser_click_coordinate / hover_coordinate / press_at.
 
         Args:
             selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
@@ -387,7 +406,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             profile: Browser profile name (default: "default")
 
         Returns:
-            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
+            Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
+            plus ``cssWidth`` / ``cssHeight`` for reference.
         """
         bridge = get_bridge()
         if not bridge or not bridge.is_connected:
@@ -404,23 +424,26 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             return result
 
         rect = result["rect"]
-        css_scale = await _ensure_css_scale(target_tab)
-        s = css_scale if css_scale > 0 else 1.0
+        cw, ch = await _ensure_viewport_size(target_tab)
+        cw_f = float(cw) if cw > 0 else 1.0
+        ch_f = float(ch) if ch > 0 else 1.0
         return {
             "ok": True,
             "selector": selector,
             "tag": rect.get("tag"),
             "rect": {
-                "x": round(rect["x"] / s, 1),
-                "y": round(rect["y"] / s, 1),
-                "w": round(rect["w"] / s, 1),
-                "h": round(rect["h"] / s, 1),
-                "cx": round(rect["cx"] / s, 1),
-                "cy": round(rect["cy"] / s, 1),
+                "x": round(rect["x"] / cw_f, 4),
+                "y": round(rect["y"] / ch_f, 4),
+                "w": round(rect["w"] / cw_f, 4),
+                "h": round(rect["h"] / ch_f, 4),
+                "cx": round(rect["cx"] / cw_f, 4),
+                "cy": round(rect["cy"] / ch_f, 4),
             },
+            "cssWidth": cw,
+            "cssHeight": ch,
             "note": (
-                "rect fields are in screenshot pixels. Pass rect.cx / "
-                "rect.cy to browser_click_coordinate / "
+                "rect fields are fractions of the viewport (0..1). "
+                "Pass rect.cx / rect.cy to browser_click_coordinate / "
                 "hover_coordinate / press_at."
             ),
         }
@@ -435,9 +458,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
         Get the bounding rect of an element by CSS selector.
 
         Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
-        content. Returns the rect in screenshot pixels — the same
-        numbers you'd read off a browser_screenshot, and the same
-        numbers browser_click_coordinate expects.
+        content. Returns the rect as **fractions of the viewport
+        (0..1)** — the same coordinate space browser_click_coordinate
+        / hover_coordinate / press_at expect.
 
         Args:
             selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
@@ -446,7 +469,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             profile: Browser profile name (default: "default")
 
         Returns:
-            Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
+            Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
+            plus ``cssWidth`` / ``cssHeight`` for reference.
         """
         bridge = get_bridge()
         if not bridge or not bridge.is_connected:
@@ -463,23 +487,26 @@ def register_inspection_tools(mcp: FastMCP) -> None:
             return result
 
         rect = result["rect"]
-        css_scale = await _ensure_css_scale(target_tab)
-        s = css_scale if css_scale > 0 else 1.0
+        cw, ch = await _ensure_viewport_size(target_tab)
+        cw_f = float(cw) if cw > 0 else 1.0
+        ch_f = float(ch) if ch > 0 else 1.0
         return {
             "ok": True,
             "selector": selector,
             "tag": rect.get("tag"),
             "rect": {
-                "x": round(rect["x"] / s, 1),
-                "y": round(rect["y"] / s, 1),
-                "w": round(rect["w"] / s, 1),
-                "h": round(rect["h"] / s, 1),
-                "cx": round(rect["cx"] / s, 1),
-                "cy": round(rect["cy"] / s, 1),
+                "x": round(rect["x"] / cw_f, 4),
+                "y": round(rect["y"] / ch_f, 4),
+                "w": round(rect["w"] / cw_f, 4),
+                "h": round(rect["h"] / ch_f, 4),
+                "cx": round(rect["cx"] / cw_f, 4),
+                "cy": round(rect["cy"] / ch_f, 4),
             },
+            "cssWidth": cw,
+            "cssHeight": ch,
             "note": (
-                "rect fields are in screenshot pixels. Pass rect.cx / "
-                "rect.cy to browser_click_coordinate / "
+                "rect fields are fractions of the viewport (0..1). "
+                "Pass rect.cx / rect.cy to browser_click_coordinate / "
                 "hover_coordinate / press_at."
             ),
         }
diff --git a/tools/src/gcu/browser/tools/interactions.py b/tools/src/gcu/browser/tools/interactions.py
index ee5596fc..06181f99 100644
--- a/tools/src/gcu/browser/tools/interactions.py
+++ b/tools/src/gcu/browser/tools/interactions.py
@@ -108,25 +108,31 @@ def register_interaction_tools(mcp: FastMCP) -> None:
         button: Literal["left", "right", "middle"] = "left",
     ) -> dict:
         """
-        Click at the given SCREENSHOT pixel.
+        Click at a FRACTION of the viewport (0..1, 0..1).
 
-        ``x`` and ``y`` are pixel coordinates read directly off a
-        ``browser_screenshot`` image (800 px wide JPEG). The tool
-        multiplies them by the cached image→CSS scale for the tab
-        before dispatching to Chrome — no scale awareness required on
-        the caller side. ``browser_get_rect`` / ``browser_shadow_query``
-        return coordinates in the same (screenshot) space.
+        Coordinates are **fractions of the viewport**, not pixels:
+        ``(0.5, 0.5)`` is the center, ``(0.1, 0.2)`` is 10 % from the
+        left and 20 % from the top. Read a target's proportional
+        position off ``browser_screenshot`` (or pass
+        ``rect.cx`` / ``rect.cy`` from ``browser_get_rect`` /
+        ``browser_shadow_query`` directly — they return fractions too).
+
+        Fractions are used because every vision model resizes or tiles
+        images differently (Claude ~1.15 MP target, GPT-4o 512-px
+        tiles, etc.). Proportional positions survive every such
+        transform; pixel coords do not.
 
         Args:
-            x: X coordinate in screenshot pixels.
-            y: Y coordinate in screenshot pixels.
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
             button: Mouse button to click (left, right, middle)
 
         Returns:
             Dict with click result, including ``focused_element``
-            describing what the click focused.
+            describing what the click focused. ``focused_element.rect``
+            is also in fractions.
         """
         start = time.perf_counter()
         params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -149,18 +155,33 @@ def register_interaction_tools(mcp: FastMCP) -> None:
             log_tool_call("browser_click_coordinate", params, result=result)
             return result
 
-        try:
-            from .inspection import _ensure_css_scale
+        # Pixel-input guard: legitimate fractions live in [0, 1]. Values in
+        # [-0.1, 1.5] pass (edge-target tolerance); anything else is rejected.
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport. Read the target's "
+                    "proportional position off browser_screenshot, or pass "
+                    "rect.cx / rect.cy from browser_get_rect / "
+                    "browser_shadow_query (they return fractions)."
+                ),
+            }
+            log_tool_call("browser_click_coordinate", params, result=result)
+            return result
 
-            css_scale = await _ensure_css_scale(target_tab)
-            s = css_scale if css_scale > 0 else 1.0
-            css_x = x * s
-            css_y = y * s
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            css_x = x * cw
+            css_y = y * ch
             click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
             log_tool_call(
                 "browser_click_coordinate",
                 params,
-                result={**click_result, "cssScale": round(css_scale, 4)},
+                result={**click_result, "cssWidth": cw, "cssHeight": ch},
                 duration_ms=(time.perf_counter() - start) * 1000,
             )
             return click_result
@@ -485,17 +506,16 @@ def register_interaction_tools(mcp: FastMCP) -> None:
         profile: str | None = None,
     ) -> dict:
         """
-        Hover at the given SCREENSHOT pixel.
+        Hover at a FRACTION of the viewport (0..1, 0..1).
 
         Use this instead of browser_hover when the element is in an overlay,
         shadow DOM, or virtual-rendered component that isn't in the regular DOM.
-        ``x`` / ``y`` are pixel coordinates read directly off a
-        ``browser_screenshot`` image; the tool translates to CSS px
-        internally before dispatching to Chrome.
+        ``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
+        the tool converts to CSS px internally.
 
         Args:
-            x: X coordinate in screenshot pixels.
-            y: Y coordinate in screenshot pixels.
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
 
@@ -523,12 +543,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
             log_tool_call("browser_hover_coordinate", params, result=result)
             return result
 
-        try:
-            from .inspection import _ensure_css_scale
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport."
+                ),
+            }
+            log_tool_call("browser_hover_coordinate", params, result=result)
+            return result
 
-            css_scale = await _ensure_css_scale(target_tab)
-            s = css_scale if css_scale > 0 else 1.0
-            hover_result = await bridge.hover_coordinate(target_tab, x * s, y * s)
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
             log_tool_call(
                 "browser_hover_coordinate",
                 params,
@@ -555,18 +585,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
         profile: str | None = None,
     ) -> dict:
         """
-        Move mouse to the given SCREENSHOT pixel, then press a key.
+        Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.
 
         Use this instead of browser_press when the focused element is in an overlay
         or virtual-rendered component. Moving the mouse first routes the key event
         through native browser hit-testing instead of the DOM focus chain.
-        ``x`` / ``y`` are pixel coordinates read directly off a
-        ``browser_screenshot`` image; the tool translates to CSS px
-        internally.
+        ``x`` / ``y`` are fractions of the viewport; the tool converts
+        to CSS px internally.
 
         Args:
-            x: X coordinate in screenshot pixels.
-            y: Y coordinate in screenshot pixels.
+            x: X fraction of the viewport (0..1).
+            y: Y fraction of the viewport (0..1).
             key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
             tab_id: Chrome tab ID (default: active tab)
             profile: Browser profile name (default: "default")
@@ -595,12 +624,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
             log_tool_call("browser_press_at", params, result=result)
             return result
 
-        try:
-            from .inspection import _ensure_css_scale
+        if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
+            result = {
+                "ok": False,
+                "error": (
+                    f"Coords ({x}, {y}) look like pixels. This tool expects "
+                    "fractions 0..1 of the viewport."
+                ),
+            }
+            log_tool_call("browser_press_at", params, result=result)
+            return result
 
-            css_scale = await _ensure_css_scale(target_tab)
-            s = css_scale if css_scale > 0 else 1.0
-            press_result = await bridge.press_key_at(target_tab, x * s, y * s, key)
+        try:
+            from .inspection import _ensure_viewport_size
+
+            cw, ch = await _ensure_viewport_size(target_tab)
+            press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
             log_tool_call(
                 "browser_press_at",
                 params,