Merge branch 'feature/full-image-size'

This commit is contained in:
Richard Tang
2026-04-16 23:15:59 -07:00
11 changed files with 415 additions and 340 deletions
+75 -38
View File
@@ -93,33 +93,57 @@ def clear_tab_highlights(tab_ids) -> None:
_interaction_highlights.pop(tid, None)
# Compact descriptor of document.activeElement. Returned by both click()
# Compact descriptor of the focused element. Returned by both click()
# and click_coordinate() so the agent can verify it focused what it
# intended, then decide whether to follow up with browser_type_focused(text=...).
# Keeping this as a single shared string avoids drift
# between the two click paths.
# intended. When the outer document's activeElement is an <iframe>,
# we recurse into the iframe's document (same-origin only) so the
# response describes the real inner element — otherwise the agent
# always sees {tag: "iframe"} and can't tell whether it hit the
# composer or something else inside the frame (e.g. a sidebar item
# in LinkedIn's #interop-outlet messaging overlay).
_FOCUSED_ELEMENT_JS = """
(function() {
function describe(el) {
var rect = el.getBoundingClientRect();
var attrs = {};
for (var i = 0; i < el.attributes.length && i < 10; i++) {
attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
}
return {
tag: el.tagName.toLowerCase(),
id: el.id || null,
className: el.className || null,
name: el.getAttribute('name') || null,
type: el.getAttribute('type') || null,
role: el.getAttribute('role') || null,
contenteditable: el.getAttribute('contenteditable') || null,
text: (el.innerText || '').substring(0, 200),
value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
attributes: attrs,
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
};
}
var el = document.activeElement;
if (!el || el === document.body) return null;
var rect = el.getBoundingClientRect();
var attrs = {};
for (var i = 0; i < el.attributes.length && i < 10; i++) {
attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
// Descend into same-origin iframes. Capped at 5 levels of
// nesting to bound cost. Cross-origin frames throw on
// contentDocument access we catch and report the outermost
// iframe instead.
var framePath = [];
var depth = 0;
while (el && (el.tagName === 'IFRAME' || el.tagName === 'FRAME') && depth < 5) {
framePath.push(el.id || el.getAttribute('data-testid') || el.tagName.toLowerCase());
var innerDoc = null;
try { innerDoc = el.contentDocument; } catch (e) { innerDoc = null; }
if (!innerDoc) break;
var innerActive = innerDoc.activeElement;
if (!innerActive || innerActive === innerDoc.body) break;
el = innerActive;
depth++;
}
return {
tag: el.tagName.toLowerCase(),
id: el.id || null,
className: el.className || null,
name: el.getAttribute('name') || null,
type: el.getAttribute('type') || null,
role: el.getAttribute('role') || null,
contenteditable: el.getAttribute('contenteditable') || null,
text: (el.innerText || '').substring(0, 200),
value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
attributes: attrs,
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
};
var out = describe(el);
if (framePath.length) out.inFrame = framePath;
return out;
})()
"""
@@ -477,9 +501,9 @@ class BeelineBridge:
"""Close a tab by ID."""
result = await self._send("tab.close", tabId=tab_id)
# Drop per-tab state — the id may be reused by Chrome much
# later, and carrying a stale highlight, scale, or "attached"
# flag forward would misannotate screenshots, misalign click
# coordinates, or skip a needed reattach on the reused id.
# later, and carrying a stale highlight or "attached" flag
# forward would misannotate screenshots or skip a needed
# reattach on the reused id.
self._cdp_attached.discard(tab_id)
_interaction_highlights.pop(tab_id, None)
from .tools.inspection import clear_tab_state
@@ -953,16 +977,36 @@ class BeelineBridge:
async def _read_focused_element(self, tab_id: int) -> dict | None:
"""Read document.activeElement and return a compact descriptor.
Returns None on any failure never raises. Used by both click
paths (selector-based click() and click_coordinate()) so the
agent gets the same response shape regardless of which one was
called. The descriptor lets the agent answer "did my click land
on an editable?" without a second round-trip.
The JS returns ``rect`` fields in CSS px (they come straight
from ``getBoundingClientRect``). We convert them to fractions
of the viewport here so the agent sees a rect in the same
coord space it passed to click / hover / press_at.
Returns None on any failure — never raises.
"""
try:
await self._try_enable_domain(tab_id, "Runtime")
result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
return (result or {}).get("result")
info = (result or {}).get("result")
if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
from .tools.inspection import _viewport_sizes
vp = _viewport_sizes.get(tab_id)
if vp and vp[0] > 0 and vp[1] > 0:
cw, ch = float(vp[0]), float(vp[1])
r = info["rect"]
info["rect"] = {
"x": round(r.get("x", 0) / cw, 4),
"y": round(r.get("y", 0) / ch, 4),
"width": round(r.get("width", 0) / cw, 4),
"height": round(r.get("height", 0) / ch, 4),
}
else:
# Degraded: cache missing (no screenshot taken
# yet). Leave rect in CSS px and flag it so the
# agent can tell.
info["rectSpace"] = "css"
return info
except Exception:
return None
@@ -975,18 +1019,11 @@ class BeelineBridge:
button_map = {"left": "left", "right": "right", "middle": "middle"}
cdp_button = button_map.get(button, "left")
from .tools.inspection import _screenshot_css_scales, _screenshot_scales
phys_scale = _screenshot_scales.get(tab_id, "unset")
css_scale = _screenshot_css_scales.get(tab_id, "unset")
logger.info(
"click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent. "
"stored_scales: physicalScale=%s, cssScale=%s",
"click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent",
tab_id,
x,
y,
phys_scale,
css_scale,
)
await self._cdp(
+11
View File
@@ -255,6 +255,17 @@ def register_advanced_tools(mcp: FastMCP) -> None:
try:
result = await bridge.resize(target_tab, width, height)
# Invalidate per-tab scale caches — CSS width changed, so the
# cached viewport dimensions are stale. Click / rect tools
# will re-query innerWidth / innerHeight on next use via
# _ensure_viewport_size.
try:
from .inspection import _screenshot_scales, _viewport_sizes
_viewport_sizes.pop(target_tab, None)
_screenshot_scales.pop(target_tab, None)
except Exception:
pass
return result
except Exception as e:
return {"ok": False, "error": str(e)}
+179 -211
View File
@@ -23,13 +23,26 @@ from .tabs import _get_context
logger = logging.getLogger(__name__)
# Target width for normalized screenshots (px in the delivered image)
_SCREENSHOT_WIDTH = 600
# Maps tab_id -> physical scale: image_coord × scale = physical pixels (for CDP Input events)
# Fixed output width for all screenshots (bandwidth default). This
# number does NOT affect coordinate semantics — click / hover / press
# and rect tools all work in fractions of the viewport (0..1), which
# are invariant to whatever resize / tile the vision API applies. The
# 800 px width is simply small enough to keep JPEG payloads under
# ~150 KB on typical UI screenshots.
_SCREENSHOT_WIDTH = 800
# Per-tab viewport-size cache populated on every browser_screenshot
# and on lazy-init inside the click tools. Stores CSS-pixel viewport
# dimensions (window.innerWidth / window.innerHeight). Click tools
# multiply fractional inputs by these to get CSS coords before
# dispatching CDP events; rect tools divide CSS-pixel DOM rects by
# these to produce fractions for the agent.
_viewport_sizes: dict[int, tuple[int, int]] = {}
# Optional debug cache — physical-px scale per tab (orig_png_w /
# _SCREENSHOT_WIDTH). Logged only; no consumer.
_screenshot_scales: dict[int, float] = {}
# Maps tab_id -> CSS scale: image_coord × scale = CSS pixels (for DOM APIs / getBoundingClientRect)
_screenshot_css_scales: dict[int, float] = {}
def clear_tab_state(tab_ids) -> None:
@@ -51,18 +64,25 @@ def _resize_and_annotate(
css_width: int,
dpr: float = 1.0,
highlights: list[dict] | None = None,
width: int = _SCREENSHOT_WIDTH,
) -> tuple[str, float, float]:
"""Resize a base64 PNG to _SCREENSHOT_WIDTH wide, annotate highlights.
) -> tuple[str, float]:
"""Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
and re-encode as JPEG quality 75.
Returns (new_b64, physical_scale, css_scale) where:
physical_scale = physical_px_per_image_px (multiply image coords physical px)
css_scale = css_px_per_image_px (multiply image coords CSS px for DOM APIs)
The image dimensions do NOT determine click coordinates any more —
the tools work in viewport fractions. This helper exists purely
for bandwidth + annotation overlay. Returns ``(new_b64,
physical_scale)`` where ``physical_scale = orig_png_w / output_w``
is kept for debug logging.
Highlights have x,y,w,h in CSS pixels (what getBoundingClientRect returns,
and what CDP Input.dispatchMouseEvent accepts).
Falls back to original data if Pillow unavailable or resize fails.
Highlight rects arrive in CSS px; they're converted to image-space
for overlay drawing by dividing by the local ``css_to_image =
css_width / output_w`` factor (computed inline — no external cache).
"""
if not css_width or css_width <= 0:
# Bridge always supplies css_width from window.innerWidth; only
# reach here on a degraded response. Return the raw PNG.
return data, 1.0
try:
from PIL import Image, ImageDraw, ImageFont
except ImportError:
@@ -72,48 +92,44 @@ def _resize_and_annotate(
import struct
orig_w = struct.unpack(">I", raw[16:20])[0]
raw_size_bytes = len(raw)
physical_scale = orig_w / width if orig_w and width else 1.0
css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
logger.warning(
"PIL not available — screenshot resize SKIPPED (cannot downscale image). "
"raw_size=%d bytes, png_width=%d, css_width=%s, dpr=%s, target_width=%d. "
"Returning ORIGINAL image with computed scales: physicalScale=%.4f, cssScale=%.4f. "
"Agent must use browser_coords() to convert image positions before clicking.",
raw_size_bytes,
orig_w,
"PIL not available — screenshot resize SKIPPED. "
"Returning raw physical-px PNG. physicalScale=%.4f, "
"css_width=%d, dpr=%s. Install Pillow for annotation.",
physical_scale,
css_width,
dpr,
width,
physical_scale,
css_scale,
)
return data, round(physical_scale, 4), round(css_scale, 4)
return data, round(physical_scale, 4)
try:
raw = base64.b64decode(data)
img = Image.open(io.BytesIO(raw)).convert("RGBA")
orig_w, orig_h = img.size
physical_scale = orig_w / width
css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
physical_scale = orig_w / _SCREENSHOT_WIDTH
new_w = _SCREENSHOT_WIDTH
new_h = round(orig_h * new_w / orig_w)
if (new_w, new_h) != img.size:
img = img.resize((new_w, new_h), Image.LANCZOS)
# Local CSS → image px factor for overlay draws. Kept local —
# not exported, not stored, not leaked to the agent.
css_to_image = css_width / _SCREENSHOT_WIDTH
logger.info(
"Screenshot resize: orig=%dx%dtarget=%dx%d, css_width=%s, dpr=%s, physicalScale=%.4f, cssScale=%.4f",
"Screenshot: orig=%dx%dout=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, css_to_image=%.4f",
orig_w,
orig_h,
width,
round(orig_h * width / orig_w),
new_w,
new_h,
css_width,
dpr,
physical_scale,
css_scale,
css_to_image,
)
new_w = width
new_h = round(orig_h * new_w / orig_w)
img = img.resize((new_w, new_h), Image.LANCZOS)
if highlights:
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
draw = ImageDraw.Draw(overlay)
@@ -125,11 +141,11 @@ def _resize_and_annotate(
for h in highlights:
kind = h.get("kind", "rect")
label = h.get("label", "")
# Highlights are in CSS px → convert to image px
ix = h["x"] / css_scale
iy = h["y"] / css_scale
iw = h.get("w", 0) / css_scale
ih = h.get("h", 0) / css_scale
# Highlights arrive in CSS px → convert to image px.
ix = h["x"] / css_to_image
iy = h["y"] / css_to_image
iw = h.get("w", 0) / css_to_image
ih = h.get("h", 0) / css_to_image
if kind == "point":
cx, cy, r = ix, iy, 10
@@ -149,11 +165,9 @@ def _resize_and_annotate(
width=2,
)
# Label: show image pixel position so user knows where to look
img_coords = f"img:({round(ix)},{round(iy)})"
display_label = f"{img_coords} {label}" if label else img_coords
display_label = f"({round(ix)},{round(iy)}) {label}".strip()
lx, ly = ix, max(2, iy - 16)
lx = max(2, min(lx, width - 120))
lx = max(2, min(lx, new_w - 120))
bbox = draw.textbbox((lx, ly), display_label, font=font)
pad = 3
draw.rectangle(
@@ -167,22 +181,50 @@ def _resize_and_annotate(
img = img.convert("RGB")
buf = io.BytesIO()
img.save(buf, format="PNG", optimize=True)
img.save(buf, format="JPEG", quality=75, optimize=True)
return (
base64.b64encode(buf.getvalue()).decode(),
round(physical_scale, 4),
round(css_scale, 4),
)
except Exception:
logger.warning(
"Screenshot resize/annotate FAILED — returning original image with scale=1.0. "
"css_width=%s, dpr=%s, target_width=%d. Clicks will be misaligned.",
"Screenshot resize/annotate FAILED — returning original image. "
"css_width=%s, dpr=%s.",
css_width,
dpr,
width,
exc_info=True,
)
return data, 1.0, 1.0
return data, 1.0
async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
    """Look up (or lazily measure) the CSS-pixel viewport size of a tab.

    A cache entry in ``_viewport_sizes`` with two positive dimensions is
    returned as-is. Otherwise the page is asked for
    ``window.innerWidth`` / ``window.innerHeight`` through the bridge and
    the answer is written to the cache before being returned.

    Click / hover / press tools multiply their fractional (0..1) inputs
    by this size to get CSS px; rect tools divide CSS-px rects by it to
    get fractions.

    Args:
        tab_id: Chrome tab ID whose viewport size is wanted.

    Returns:
        ``(cssWidth, cssHeight)``. When the bridge query raises or yields
        a non-positive dimension, ``(1, 1)`` is cached and returned —
        that makes every coordinate conversion an identity op instead of
        crashing.
    """
    known = _viewport_sizes.get(tab_id)
    if known is not None:
        if known[0] > 0 and known[1] > 0:
            return known

    bridge = get_bridge()
    size = (0, 0)
    try:
        reply = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
        payload = (reply or {}).get("result") or {}
        size = (
            int(float(payload.get("w") or 0)),
            int(float(payload.get("h") or 0)),
        )
    except Exception:
        # Any bridge or parsing failure is treated as "no viewport
        # reported"; both dimensions are discarded together.
        size = (0, 0)

    if not (size[0] > 0 and size[1] > 0):
        # Degraded: cache an identity size so the page is not re-queried
        # on every call. The entry is replaced the next time
        # browser_screenshot stores real viewport dimensions.
        size = (1, 1)

    _viewport_sizes[tab_id] = size
    return size
def register_inspection_tools(mcp: FastMCP) -> None:
@@ -194,29 +236,33 @@ def register_inspection_tools(mcp: FastMCP) -> None:
profile: str | None = None,
full_page: bool = False,
selector: str | None = None,
image_type: Literal["png", "jpeg"] = "png",
annotate: bool = True,
width: int = _SCREENSHOT_WIDTH,
) -> list:
"""
Take a screenshot of the current page.
Returns a normalized image alongside text metadata (URL, size, scale
factors, etc.). Automatically annotates the last interaction (click,
hover, type) with a bounding box overlay.
Image is 800 px wide (JPEG quality 75, ~50–120 KB). All
coordinate tools work in **fractions of the viewport (0..1)**,
not pixels — so read a target's proportional position off this
image ("~35 % from the left, ~20 % from the top") and pass
``(0.35, 0.20)`` to ``browser_click_coordinate`` /
``browser_hover_coordinate`` / ``browser_press_at``.
``browser_get_rect`` and ``browser_shadow_query`` likewise
return coordinates as fractions.
Args:
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
full_page: Capture full scrollable page (default: False)
full_page: Capture full scrollable page (default: False).
Note: full_page images extend beyond the viewport, so
fractions read off them do NOT map cleanly to
viewport-space clicks. Use for reading / overview only,
not for pointing.
selector: CSS selector to screenshot a specific element (optional)
image_type: Image format - png or jpeg (default: png)
annotate: Draw bounding box of last interaction on image (default: True)
width: Output image width in pixels (default: 600). Use 800+ for fine
text, 400 for quick layout checks.
Returns:
List of content blocks: text metadata + image
List of content blocks: text metadata + image.
"""
start = time.perf_counter()
params = {
@@ -266,7 +312,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
return [TextContent(type="text", text=json.dumps(screenshot_result))]
data = screenshot_result.get("data")
mime_type = screenshot_result.get("mimeType", "image/png")
css_width = screenshot_result.get("cssWidth", 0)
dpr = screenshot_result.get("devicePixelRatio", 1.0)
@@ -277,45 +322,50 @@ def register_inspection_tools(mcp: FastMCP) -> None:
if annotate and target_tab in _interaction_highlights:
highlights = [_interaction_highlights[target_tab]]
# Normalize to 800px wide and annotate. Offloaded to a
# thread because PIL Image.open/resize/ImageDraw/composite on
# a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
# freeze the asyncio event loop and delay every concurrent
# tool call during a screenshot. The function is reentrant
# (fresh PIL Image per call, no shared state), so to_thread
# is safe.
data, physical_scale, css_scale = await asyncio.to_thread(
# Resize to _SCREENSHOT_WIDTH (800 px) wide and re-encode as
# JPEG. Offloaded to a thread because PIL
# Image.open/resize/ImageDraw/composite on a 2-megapixel
# PNG blocks for ~150–300 ms of CPU — plenty to freeze the
# asyncio event loop. Reentrant: no shared state.
data, physical_scale = await asyncio.to_thread(
_resize_and_annotate,
data,
css_width,
dpr,
highlights,
width,
)
_screenshot_scales[target_tab] = physical_scale
_screenshot_css_scales[target_tab] = css_scale
# Cache live viewport dimensions so click / hover / press /
# rect tools can translate fractions ↔ CSS px without
# asking the page again.
css_height = int(screenshot_result.get("cssHeight", 0)) or 0
if target_tab is not None and css_width > 0 and css_height > 0:
_viewport_sizes[target_tab] = (int(css_width), css_height)
_screenshot_scales[target_tab] = physical_scale
meta = json.dumps(
{
"ok": True,
"tabId": target_tab,
"url": screenshot_result.get("url", ""),
"imageType": mime_type.split("/")[-1],
"imageType": "jpeg",
"size": len(base64.b64decode(data)) if data else 0,
"imageWidth": width,
"imageWidth": _SCREENSHOT_WIDTH,
"cssWidth": css_width,
"cssHeight": css_height,
"fullPage": full_page,
"devicePixelRatio": dpr,
"physicalScale": physical_scale,
"cssScale": css_scale,
"annotated": bool(highlights),
"scaleHint": (
f"image_coord × {css_scale} = CSS px "
f"→ feed to browser_click_coordinate, "
f"browser_hover_coordinate, browser_press_at "
f"(CDP Input events use CSS pixels). "
f"image_coord × {physical_scale} = physical px "
f"is debug-only on HiDPI displays and must NOT "
f"be used for clicks — it overshoots by DPR×."
"Coordinates for click / hover / press are "
"fractions 0..1 of the viewport. Read a "
"target's proportional position off this image "
"(e.g. '~35 % from the left, ~20 % from the top' "
"→ (0.35, 0.20)) and pass that to "
"browser_click_coordinate / "
"browser_hover_coordinate / browser_press_at. "
"browser_get_rect / browser_shadow_query / "
"focused_element.rect return fractions too."
),
}
)
@@ -327,17 +377,17 @@ def register_inspection_tools(mcp: FastMCP) -> None:
"ok": True,
"size": len(base64.b64decode(data)) if data else 0,
"url": screenshot_result.get("url", ""),
"cssWidth": css_width,
"cssHeight": css_height,
"physicalScale": physical_scale,
"cssScale": css_scale,
"debug_cssWidth": css_width,
"debug_dpr": dpr,
"dpr": dpr,
},
duration_ms=(time.perf_counter() - start) * 1000,
)
return [
TextContent(type="text", text=meta),
ImageContent(type="image", data=data, mimeType=mime_type),
ImageContent(type="image", data=data, mimeType="image/jpeg"),
]
except Exception as e:
log_tool_call(
@@ -348,73 +398,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
)
return [TextContent(type="text", text=json.dumps({"ok": False, "error": str(e)}))]
@mcp.tool()
def browser_coords(
x: float,
y: float,
tab_id: int | None = None,
profile: str | None = None,
) -> dict:
"""
Convert screenshot image coordinates to browser click coordinates.
After browser_screenshot returns a downscaled image, use this to
translate pixel positions you see in the image into the CSS pixel
coordinates that Chrome DevTools Protocol expects.
**CDP Input.dispatchMouseEvent uses CSS pixels**, so you want
``css_x`` / ``css_y`` for every click/hover tool. ``physical_x/y``
is kept in the return for debugging on HiDPI displays do NOT
feed it to clicks; on a DPR=2 screen it lands 2× too far.
Edge case: pages using ``zoom`` or ``transform: scale()`` (e.g.
LinkedIn's ``#interop-outlet`` shadow DOM) render in a scaled
local coordinate space. For those, ``getBoundingClientRect()``
reports pre-zoom coordinates and you may still need to multiply
by the element's effective zoom. Use browser_shadow_query to
get the zoomed rect directly.
Args:
x: X pixel position in the screenshot image
y: Y pixel position in the screenshot image
tab_id: Chrome tab ID (default: active tab for profile)
profile: Browser profile name (default: "default")
Returns:
Dict with css_x, css_y (primary use these), physical_x,
physical_y (debug only), and scale factors.
"""
ctx = _get_context(profile)
target_tab = tab_id or (ctx.get("activeTabId") if ctx else None)
physical_scale = _screenshot_scales.get(target_tab, 1.0) if target_tab else 1.0
# css_scale stored in second slot via _screenshot_css_scales
css_scale = _screenshot_css_scales.get(target_tab, physical_scale) if target_tab else physical_scale
return {
"ok": True,
# Primary output: CSS pixels. Feed these to click/hover/press.
"css_x": round(x * css_scale, 1),
"css_y": round(y * css_scale, 1),
# Debug output: raw physical pixels. DO NOT feed to clicks on
# HiDPI displays — CDP Input events use CSS pixels, so sending
# physical coordinates lands the click at roughly DPR× the
# intended position.
"physical_x": round(x * physical_scale, 1),
"physical_y": round(y * physical_scale, 1),
"physicalScale": physical_scale,
"cssScale": css_scale,
"tabId": target_tab,
"note": (
"Use css_x/css_y with browser_click_coordinate, "
"browser_hover_coordinate, browser_press_at — "
"Chrome DevTools Protocol Input.dispatchMouseEvent "
"operates in CSS pixels. physical_x/y is for debugging "
"on HiDPI displays only; feeding it to clicks lands "
"them at DPR× the intended coordinate."
),
}
@mcp.tool()
async def browser_shadow_query(
selector: str,
@@ -426,7 +409,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
Traverses shadow roots to find elements inside closed/open shadow DOM,
overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
Returns getBoundingClientRect in both CSS and physical pixels.
Returns the element's bounding rect as **fractions of the
viewport (0..1)** — feed ``rect.cx`` / ``rect.cy`` straight
into browser_click_coordinate / hover_coordinate / press_at.
Args:
selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
@@ -435,7 +420,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
profile: Browser profile name (default: "default")
Returns:
Dict with rect (CSS px) and physical rect (CSS px × DPR) of the element
Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
plus ``cssWidth`` / ``cssHeight`` for reference.
"""
bridge = get_bridge()
if not bridge or not bridge.is_connected:
@@ -452,36 +438,27 @@ def register_inspection_tools(mcp: FastMCP) -> None:
return result
rect = result["rect"]
physical_scale = _screenshot_scales.get(target_tab, 1.0)
css_scale = _screenshot_css_scales.get(target_tab, 1.0)
dpr = physical_scale / css_scale if css_scale else 1.0
cw, ch = await _ensure_viewport_size(target_tab)
cw_f = float(cw) if cw > 0 else 1.0
ch_f = float(ch) if ch > 0 else 1.0
return {
"ok": True,
"selector": selector,
"tag": rect.get("tag"),
"css": {
"x": rect["x"],
"y": rect["y"],
"w": rect["w"],
"h": rect["h"],
"cx": rect["cx"],
"cy": rect["cy"],
},
"physical": {
"x": round(rect["x"] * dpr, 1),
"y": round(rect["y"] * dpr, 1),
"w": round(rect["w"] * dpr, 1),
"h": round(rect["h"] * dpr, 1),
"cx": round(rect["cx"] * dpr, 1),
"cy": round(rect["cy"] * dpr, 1),
"rect": {
"x": round(rect["x"] / cw_f, 4),
"y": round(rect["y"] / ch_f, 4),
"w": round(rect["w"] / cw_f, 4),
"h": round(rect["h"] / ch_f, 4),
"cx": round(rect["cx"] / cw_f, 4),
"cy": round(rect["cy"] / ch_f, 4),
},
"cssWidth": cw,
"cssHeight": ch,
"note": (
"Use css.cx/cy with browser_click_coordinate, "
"browser_hover_coordinate, browser_press_at — "
"CDP Input events operate in CSS pixels. "
"physical.* is debug-only; feeding it to clicks "
"lands them DPR× too far on HiDPI displays."
"rect fields are fractions of the viewport (0..1). "
"Pass rect.cx / rect.cy to browser_click_coordinate / "
"hover_coordinate / press_at."
),
}
@@ -494,11 +471,10 @@ def register_inspection_tools(mcp: FastMCP) -> None:
"""
Get the bounding rect of an element by CSS selector.
Supports '>>>' shadow-piercing selectors for overlay/shadow DOM content.
Returns coordinates in CSS pixels (for clicks and DOM APIs); the
physical-pixel variant is returned for debugging on HiDPI displays
only it must not be fed to click/hover/press tools, which use
CSS pixels.
Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
content. Returns the rect as **fractions of the viewport
(0..1)** — the same coordinate space browser_click_coordinate
/ hover_coordinate / press_at expect.
Args:
selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
@@ -507,7 +483,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
profile: Browser profile name (default: "default")
Returns:
Dict with css and physical bounding rects
Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
plus ``cssWidth`` / ``cssHeight`` for reference.
"""
bridge = get_bridge()
if not bridge or not bridge.is_connected:
@@ -524,36 +501,27 @@ def register_inspection_tools(mcp: FastMCP) -> None:
return result
rect = result["rect"]
physical_scale = _screenshot_scales.get(target_tab, 1.0)
css_scale = _screenshot_css_scales.get(target_tab, 1.0)
dpr = physical_scale / css_scale if css_scale else 1.0
cw, ch = await _ensure_viewport_size(target_tab)
cw_f = float(cw) if cw > 0 else 1.0
ch_f = float(ch) if ch > 0 else 1.0
return {
"ok": True,
"selector": selector,
"tag": rect.get("tag"),
"css": {
"x": rect["x"],
"y": rect["y"],
"w": rect["w"],
"h": rect["h"],
"cx": rect["cx"],
"cy": rect["cy"],
},
"physical": {
"x": round(rect["x"] * dpr, 1),
"y": round(rect["y"] * dpr, 1),
"w": round(rect["w"] * dpr, 1),
"h": round(rect["h"] * dpr, 1),
"cx": round(rect["cx"] * dpr, 1),
"cy": round(rect["cy"] * dpr, 1),
"rect": {
"x": round(rect["x"] / cw_f, 4),
"y": round(rect["y"] / ch_f, 4),
"w": round(rect["w"] / cw_f, 4),
"h": round(rect["h"] / ch_f, 4),
"cx": round(rect["cx"] / cw_f, 4),
"cy": round(rect["cy"] / ch_f, 4),
},
"cssWidth": cw,
"cssHeight": ch,
"note": (
"Use css.cx/cy with browser_click_coordinate, "
"browser_hover_coordinate, browser_press_at — "
"CDP Input events operate in CSS pixels. "
"physical.* is debug-only; feeding it to clicks "
"lands them DPR× too far on HiDPI displays."
"rect fields are fractions of the viewport (0..1). "
"Pass rect.cx / rect.cy to browser_click_coordinate / "
"hover_coordinate / press_at."
),
}
+80 -28
View File
@@ -108,24 +108,31 @@ def register_interaction_tools(mcp: FastMCP) -> None:
button: Literal["left", "right", "middle"] = "left",
) -> dict:
"""
Click at specific viewport coordinates (CSS pixels).
Click at a FRACTION of the viewport (0..1, 0..1).
Chrome DevTools Protocol's Input.dispatchMouseEvent operates in
**CSS pixels**, not physical pixels. If you have a screenshot
image coordinate, convert it with ``browser_coords(x, y)`` and
use the returned ``css_x`` / ``css_y`` not ``physical_x/y``.
On a DPR=2 display, feeding physical coordinates lands the click
at 2× the intended position.
Coordinates are **fractions of the viewport**, not pixels:
``(0.5, 0.5)`` is the center, ``(0.1, 0.2)`` is 10 % from the
left and 20 % from the top. Read a target's proportional
position off ``browser_screenshot`` (or pass
``rect.cx`` / ``rect.cy`` from ``browser_get_rect`` /
``browser_shadow_query`` directly — they return fractions too).
Fractions are used because every vision model resizes or tiles
images differently (Claude ~1.15 MP target, GPT-4o 512-px
tiles, etc.). Proportional positions survive every such
transform; pixel coords do not.
Args:
x: X coordinate in CSS pixels (viewport space)
y: Y coordinate in CSS pixels (viewport space)
x: X fraction of the viewport (0..1).
y: Y fraction of the viewport (0..1).
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
button: Mouse button to click (left, right, middle)
Returns:
Dict with click result
Dict with click result, including ``focused_element``
describing what the click focused. ``focused_element.rect``
is also in fractions.
"""
start = time.perf_counter()
params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -148,18 +155,33 @@ def register_interaction_tools(mcp: FastMCP) -> None:
log_tool_call("browser_click_coordinate", params, result=result)
return result
try:
from .inspection import _screenshot_css_scales, _screenshot_scales
# Pixel-input guard: legitimate fractions live in [0, 1]. Allow a
# small overshoot tolerance for edge targets.
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
result = {
"ok": False,
"error": (
f"Coords ({x}, {y}) look like pixels. This tool expects "
"fractions 0..1 of the viewport. Read the target's "
"proportional position off browser_screenshot, or pass "
"rect.cx / rect.cy from browser_get_rect / "
"browser_shadow_query (they return fractions)."
),
}
log_tool_call("browser_click_coordinate", params, result=result)
return result
click_result = await bridge.click_coordinate(target_tab, x, y, button=button)
try:
from .inspection import _ensure_viewport_size
cw, ch = await _ensure_viewport_size(target_tab)
css_x = x * cw
css_y = y * ch
click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
log_tool_call(
"browser_click_coordinate",
params,
result={
**click_result,
"debug_stored_physicalScale": _screenshot_scales.get(target_tab, "unset"),
"debug_stored_cssScale": _screenshot_css_scales.get(target_tab, "unset"),
},
result={**click_result, "cssWidth": cw, "cssHeight": ch},
duration_ms=(time.perf_counter() - start) * 1000,
)
return click_result
@@ -484,15 +506,16 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: str | None = None,
) -> dict:
"""
Hover at CSS pixel coordinates without needing a CSS selector.
Hover at a FRACTION of the viewport (0..1, 0..1).
Use this instead of browser_hover when the element is in an overlay,
shadow DOM, or virtual-rendered component that isn't in the regular DOM.
Pair with browser_coords to convert screenshot image positions to CSS pixels.
``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
the tool converts to CSS px internally.
Args:
x: CSS pixel X coordinate
y: CSS pixel Y coordinate
x: X fraction of the viewport (0..1).
y: Y fraction of the viewport (0..1).
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
@@ -520,8 +543,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
log_tool_call("browser_hover_coordinate", params, result=result)
return result
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
result = {
"ok": False,
"error": (
f"Coords ({x}, {y}) look like pixels. This tool expects "
"fractions 0..1 of the viewport."
),
}
log_tool_call("browser_hover_coordinate", params, result=result)
return result
try:
hover_result = await bridge.hover_coordinate(target_tab, x, y)
from .inspection import _ensure_viewport_size
cw, ch = await _ensure_viewport_size(target_tab)
hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
log_tool_call(
"browser_hover_coordinate",
params,
@@ -548,16 +585,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: str | None = None,
) -> dict:
"""
Move mouse to CSS pixel coordinates then press a key.
Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.
Use this instead of browser_press when the focused element is in an overlay
or virtual-rendered component. Moving the mouse first routes the key event
through native browser hit-testing instead of the DOM focus chain.
Pair with browser_coords to convert screenshot image positions to CSS pixels.
``x`` / ``y`` are fractions of the viewport; the tool converts
to CSS px internally.
Args:
x: CSS pixel X coordinate to position mouse
y: CSS pixel Y coordinate to position mouse
x: X fraction of the viewport (0..1).
y: Y fraction of the viewport (0..1).
key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
@@ -586,8 +624,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
log_tool_call("browser_press_at", params, result=result)
return result
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
result = {
"ok": False,
"error": (
f"Coords ({x}, {y}) look like pixels. This tool expects "
"fractions 0..1 of the viewport."
),
}
log_tool_call("browser_press_at", params, result=result)
return result
try:
press_result = await bridge.press_key_at(target_tab, x, y, key)
from .inspection import _ensure_viewport_size
cw, ch = await _ensure_viewport_size(target_tab)
press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
log_tool_call(
"browser_press_at",
params,
+4 -1
View File
@@ -139,7 +139,10 @@ def main() -> None:
mcp.run(transport="stdio")
else:
logger.info(f"Starting GCU server on {args.host}:{args.port}")
mcp.run(transport="http", host=args.host, port=args.port)
# FastMCP.run() forwards kwargs to anyio.run() instead of the
# transport, which breaks host/port for SSE. Invoke run_async
# directly so the kwargs land on run_sse_async.
asyncio.run(mcp.run_async(transport="sse", host=args.host, port=args.port))
if __name__ == "__main__":