fix: model invariant screenshot
This commit is contained in:
@@ -80,33 +80,57 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None:
|
||||
_interaction_highlights: dict[int, dict] = {}
|
||||
|
||||
|
||||
# Compact descriptor of the focused element. Returned by both click()
# and click_coordinate() so the agent can verify it focused what it
# intended. When the outer document's activeElement is an <iframe>,
# we recurse into the iframe's document (same-origin only) so the
# response describes the real inner element — otherwise the agent
# always sees {tag: "iframe"} and can't tell whether it hit the
# composer or something else inside the frame (e.g. a sidebar item
# in LinkedIn's #interop-outlet messaging overlay).
#
# getBoundingClientRect() called on an element inside a frame is
# relative to that frame's own viewport. The script therefore adds
# back the content-box origin of every iframe it descends through, so
# the returned ``rect`` is always expressed in top-level viewport CSS
# px. CSS transforms applied to a frame element are not compensated.
_FOCUSED_ELEMENT_JS = """
(function() {
    function describe(el) {
        var rect = el.getBoundingClientRect();
        var attrs = {};
        for (var i = 0; i < el.attributes.length && i < 10; i++) {
            attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
        }
        return {
            tag: el.tagName.toLowerCase(),
            id: el.id || null,
            className: el.className || null,
            name: el.getAttribute('name') || null,
            type: el.getAttribute('type') || null,
            role: el.getAttribute('role') || null,
            contenteditable: el.getAttribute('contenteditable') || null,
            text: (el.innerText || '').substring(0, 200),
            value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
            attributes: attrs,
            rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
        };
    }
    // Top-left corner of a frame's content box, measured in the
    // viewport of the document that hosts the frame: border-box
    // origin plus border width plus padding.
    function contentOrigin(frame) {
        var box = frame.getBoundingClientRect();
        var view = frame.ownerDocument.defaultView;
        var cs = view ? view.getComputedStyle(frame) : null;
        return {
            x: box.x + frame.clientLeft + (cs ? parseFloat(cs.paddingLeft) || 0 : 0),
            y: box.y + frame.clientTop + (cs ? parseFloat(cs.paddingTop) || 0 : 0)
        };
    }
    var el = document.activeElement;
    if (!el || el === document.body) return null;
    // Descend into same-origin iframes. Capped at 5 levels of
    // nesting to bound cost. For a cross-origin frame the inner
    // document is unavailable, so we stop and describe the frame
    // element itself.
    var framePath = [];
    var offsetX = 0;
    var offsetY = 0;
    var depth = 0;
    while (el && (el.tagName === 'IFRAME' || el.tagName === 'FRAME') && depth < 5) {
        framePath.push(el.id || el.getAttribute('data-testid') || el.tagName.toLowerCase());
        var innerDoc = null;
        try { innerDoc = el.contentDocument; } catch (e) { innerDoc = null; }
        if (!innerDoc) break;
        var innerActive = innerDoc.activeElement;
        if (!innerActive || innerActive === innerDoc.body) break;
        // Only accumulate once we are sure we step inside this frame.
        var origin = contentOrigin(el);
        offsetX += origin.x;
        offsetY += origin.y;
        el = innerActive;
        depth++;
    }
    var out = describe(el);
    // Translate from the innermost frame's viewport to the top-level one.
    out.rect.x += offsetX;
    out.rect.y += offsetY;
    if (framePath.length) out.inFrame = framePath;
    return out;
})()
"""
|
||||
|
||||
@@ -937,16 +961,33 @@ class BeelineBridge:
|
||||
async def _read_focused_element(self, tab_id: int) -> dict | None:
    """Read document.activeElement and return a compact descriptor.

    The JS returns ``rect`` fields in CSS px (they come straight
    from ``getBoundingClientRect``). We scale them to screenshot
    pixels here so the agent sees a rect in the same coord space
    it passed to click / hover / press_at.

    The image→CSS scale is obtained through ``_ensure_css_scale``,
    the same helper the click and rect tools use, so the rect is
    still converted when no screenshot has been taken on this tab
    yet (previously the raw cache lookup fell back to 1.0 and the
    rect silently stayed in CSS px).

    Args:
        tab_id: Chrome tab whose focused element is described.

    Returns:
        The descriptor dict produced by ``_FOCUSED_ELEMENT_JS`` with
        ``rect`` rescaled to screenshot px, or None when nothing is
        focused or on any failure — never raises.
    """
    try:
        await self._try_enable_domain(tab_id, "Runtime")
        result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
        info = (result or {}).get("result")
        if isinstance(info, dict) and isinstance(info.get("rect"), dict):
            # Imported lazily, as the original did, to avoid a module
            # import cycle between the bridge and the tools package.
            from .tools.inspection import _ensure_css_scale

            # scale is an image_px × scale = css_px multiplier, so
            # CSS px → screenshot px is a division.
            scale = await _ensure_css_scale(tab_id)
            if scale > 0 and scale != 1.0:
                css_rect = info["rect"]
                info["rect"] = {
                    key: round(css_rect.get(key, 0) / scale, 1)
                    for key in ("x", "y", "width", "height")
                }
        return info
    except Exception:
        # Best-effort by contract: the click itself already succeeded,
        # so a failure to describe focus must not surface as an error.
        return None
|
||||
|
||||
@@ -959,18 +1000,11 @@ class BeelineBridge:
|
||||
button_map = {"left": "left", "right": "right", "middle": "middle"}
|
||||
cdp_button = button_map.get(button, "left")
|
||||
|
||||
from .tools.inspection import _screenshot_css_scales, _screenshot_scales
|
||||
|
||||
phys_scale = _screenshot_scales.get(tab_id, "unset")
|
||||
css_scale = _screenshot_css_scales.get(tab_id, "unset")
|
||||
logger.info(
|
||||
"click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent. "
|
||||
"stored_scales: physicalScale=%s, cssScale=%s",
|
||||
"click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent",
|
||||
tab_id,
|
||||
x,
|
||||
y,
|
||||
phys_scale,
|
||||
css_scale,
|
||||
)
|
||||
|
||||
await self._cdp(
|
||||
|
||||
@@ -255,6 +255,16 @@ def register_advanced_tools(mcp: FastMCP) -> None:
|
||||
|
||||
try:
|
||||
result = await bridge.resize(target_tab, width, height)
|
||||
# Invalidate per-tab scale caches — CSS width changed, so the
|
||||
# cached image→CSS multiplier is stale. Click / rect tools
|
||||
# will re-query innerWidth on next use via _ensure_css_scale.
|
||||
try:
|
||||
from .inspection import _screenshot_css_scales, _screenshot_scales
|
||||
|
||||
_screenshot_css_scales.pop(target_tab, None)
|
||||
_screenshot_scales.pop(target_tab, None)
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
except Exception as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
@@ -23,12 +23,21 @@ from .tabs import _get_context
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Target width for normalized screenshots (px in the delivered image)
|
||||
_SCREENSHOT_WIDTH = 600
|
||||
|
||||
# Maps tab_id -> physical scale: image_coord × scale = physical pixels (for CDP Input events)
|
||||
# Fixed output width for all screenshots. Chosen well below Anthropic's
|
||||
# ~1568-px vision-API resize threshold so the image the server emits is
|
||||
# the SAME image (pixel-for-pixel) the LLM sees. That preserves
|
||||
# image_px == model_px, which is the cornerstone of the "LLM works in
|
||||
# screenshot pixels only" contract — all click/hover/press/rect tools
|
||||
# translate between image pixels and CSS pixels internally.
|
||||
_SCREENSHOT_WIDTH = 800
|
||||
|
||||
# Per-tab scale caches populated on every browser_screenshot and on
|
||||
# lazy-init inside the click tools. Both are ``image_px × scale =
|
||||
# target_px`` multipliers.
|
||||
# - _screenshot_scales[tab] → physical scale (image → physical px, debug only)
|
||||
# - _screenshot_css_scales[tab] → css scale (image → CSS px, used for Input events)
|
||||
_screenshot_scales: dict[int, float] = {}
|
||||
# Maps tab_id -> CSS scale: image_coord × scale = CSS pixels (for DOM APIs / getBoundingClientRect)
|
||||
_screenshot_css_scales: dict[int, float] = {}
|
||||
|
||||
|
||||
@@ -37,18 +46,28 @@ def _resize_and_annotate(
|
||||
css_width: int,
|
||||
dpr: float = 1.0,
|
||||
highlights: list[dict] | None = None,
|
||||
width: int = _SCREENSHOT_WIDTH,
|
||||
) -> tuple[str, float, float]:
|
||||
"""Resize a base64 PNG to _SCREENSHOT_WIDTH wide, annotate highlights.
|
||||
"""Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
|
||||
and re-encode as JPEG quality 75.
|
||||
|
||||
Returns (new_b64, physical_scale, css_scale) where:
|
||||
physical_scale = physical_px_per_image_px (multiply image coords → physical px)
|
||||
css_scale = css_px_per_image_px (multiply image coords → CSS px for DOM APIs)
|
||||
CDP captures at the physical-pixel resolution (DPR × CSS). We
|
||||
downscale to 800 px wide so the delivered image stays under
|
||||
Anthropic's vision-API resize cap — the model sees pixel-for-pixel
|
||||
what we send.
|
||||
|
||||
Highlights have x,y,w,h in CSS pixels (what getBoundingClientRect returns,
|
||||
and what CDP Input.dispatchMouseEvent accepts).
|
||||
Falls back to original data if Pillow unavailable or resize fails.
|
||||
Returns ``(new_b64, physical_scale, css_scale)`` where
|
||||
- ``physical_scale = orig_png_w / _SCREENSHOT_WIDTH`` (image → physical px)
|
||||
- ``css_scale = css_width / _SCREENSHOT_WIDTH`` (image → CSS px)
|
||||
|
||||
Highlight rects arrive in CSS px and are divided by ``css_scale``
|
||||
before drawing so overlays land in the correct spot on the
|
||||
800-wide output.
|
||||
"""
|
||||
if not css_width or css_width <= 0:
|
||||
# Bridge always supplies css_width from window.innerWidth; only
|
||||
# reach here on a degraded response. Return the raw PNG.
|
||||
return data, 1.0, 1.0
|
||||
|
||||
try:
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
except ImportError:
|
||||
@@ -58,21 +77,16 @@ def _resize_and_annotate(
|
||||
import struct
|
||||
|
||||
orig_w = struct.unpack(">I", raw[16:20])[0]
|
||||
raw_size_bytes = len(raw)
|
||||
physical_scale = orig_w / width if orig_w and width else 1.0
|
||||
css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
|
||||
physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
|
||||
css_scale = css_width / _SCREENSHOT_WIDTH
|
||||
logger.warning(
|
||||
"PIL not available — screenshot resize SKIPPED (cannot downscale image). "
|
||||
"raw_size=%d bytes, png_width=%d, css_width=%s, dpr=%s, target_width=%d. "
|
||||
"Returning ORIGINAL image with computed scales: physicalScale=%.4f, cssScale=%.4f. "
|
||||
"Agent must use browser_coords() to convert image positions before clicking.",
|
||||
raw_size_bytes,
|
||||
orig_w,
|
||||
css_width,
|
||||
dpr,
|
||||
width,
|
||||
"PIL not available — screenshot resize SKIPPED. "
|
||||
"Returning raw physical-px PNG. physicalScale=%.4f, "
|
||||
"cssScale=%.4f, css_width=%d, dpr=%s. Install Pillow for correct clicks.",
|
||||
physical_scale,
|
||||
css_scale,
|
||||
css_width,
|
||||
dpr,
|
||||
)
|
||||
return data, round(physical_scale, 4), round(css_scale, 4)
|
||||
|
||||
@@ -81,25 +95,25 @@ def _resize_and_annotate(
|
||||
img = Image.open(io.BytesIO(raw)).convert("RGBA")
|
||||
orig_w, orig_h = img.size
|
||||
|
||||
physical_scale = orig_w / width
|
||||
css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
|
||||
physical_scale = orig_w / _SCREENSHOT_WIDTH
|
||||
css_scale = css_width / _SCREENSHOT_WIDTH
|
||||
new_w = _SCREENSHOT_WIDTH
|
||||
new_h = round(orig_h * new_w / orig_w)
|
||||
if (new_w, new_h) != img.size:
|
||||
img = img.resize((new_w, new_h), Image.LANCZOS)
|
||||
|
||||
logger.info(
|
||||
"Screenshot resize: orig=%dx%d → target=%dx%d, css_width=%s, dpr=%s, physicalScale=%.4f, cssScale=%.4f",
|
||||
"Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, cssScale=%.4f",
|
||||
orig_w,
|
||||
orig_h,
|
||||
width,
|
||||
round(orig_h * width / orig_w),
|
||||
new_w,
|
||||
new_h,
|
||||
css_width,
|
||||
dpr,
|
||||
physical_scale,
|
||||
css_scale,
|
||||
)
|
||||
|
||||
new_w = width
|
||||
new_h = round(orig_h * new_w / orig_w)
|
||||
img = img.resize((new_w, new_h), Image.LANCZOS)
|
||||
|
||||
if highlights:
|
||||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
||||
draw = ImageDraw.Draw(overlay)
|
||||
@@ -111,7 +125,7 @@ def _resize_and_annotate(
|
||||
for h in highlights:
|
||||
kind = h.get("kind", "rect")
|
||||
label = h.get("label", "")
|
||||
# Highlights are in CSS px → convert to image px
|
||||
# Highlights arrive in CSS px → convert to image px.
|
||||
ix = h["x"] / css_scale
|
||||
iy = h["y"] / css_scale
|
||||
iw = h.get("w", 0) / css_scale
|
||||
@@ -135,11 +149,9 @@ def _resize_and_annotate(
|
||||
width=2,
|
||||
)
|
||||
|
||||
# Label: show image pixel position so user knows where to look
|
||||
img_coords = f"img:({round(ix)},{round(iy)})"
|
||||
display_label = f"{img_coords} {label}" if label else img_coords
|
||||
display_label = f"({round(ix)},{round(iy)}) {label}".strip()
|
||||
lx, ly = ix, max(2, iy - 16)
|
||||
lx = max(2, min(lx, width - 120))
|
||||
lx = max(2, min(lx, new_w - 120))
|
||||
bbox = draw.textbbox((lx, ly), display_label, font=font)
|
||||
pad = 3
|
||||
draw.rectangle(
|
||||
@@ -153,7 +165,7 @@ def _resize_and_annotate(
|
||||
img = img.convert("RGB")
|
||||
|
||||
buf = io.BytesIO()
|
||||
img.save(buf, format="PNG", optimize=True)
|
||||
img.save(buf, format="JPEG", quality=75, optimize=True)
|
||||
return (
|
||||
base64.b64encode(buf.getvalue()).decode(),
|
||||
round(physical_scale, 4),
|
||||
@@ -161,16 +173,38 @@ def _resize_and_annotate(
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Screenshot resize/annotate FAILED — returning original image with scale=1.0. "
|
||||
"css_width=%s, dpr=%s, target_width=%d. Clicks will be misaligned.",
|
||||
"Screenshot resize/annotate FAILED — returning original image. "
|
||||
"css_width=%s, dpr=%s.",
|
||||
css_width,
|
||||
dpr,
|
||||
width,
|
||||
exc_info=True,
|
||||
)
|
||||
return data, 1.0, 1.0
|
||||
|
||||
|
||||
async def _ensure_css_scale(tab_id: int) -> float:
    """Return the image→CSS scale for ``tab_id``.

    The scale is an ``image_px × scale = css_px`` multiplier. A
    positive cached value is returned as-is. Otherwise the page is
    asked for ``window.innerWidth`` and the scale is computed as
    ``innerWidth / _SCREENSHOT_WIDTH`` and cached. Used by click tools
    when the agent clicks before the first screenshot has been taken.

    When the viewport width cannot be read, 1.0 is returned (image px
    treated as CSS px) but deliberately NOT cached: caching it would
    let one transient ``evaluate`` failure pin a wrong scale on the
    tab until the next screenshot or resize refreshed the cache.

    Args:
        tab_id: Chrome tab whose scale is wanted.

    Returns:
        The positive image→CSS multiplier, or 1.0 in the degraded case.
    """
    cached = _screenshot_css_scales.get(tab_id)
    if cached is not None and cached > 0:
        return cached

    bridge = get_bridge()
    try:
        result = await bridge.evaluate(tab_id, "({w: window.innerWidth})")
        inner = float(((result or {}).get("result") or {}).get("w") or 0)
    except Exception:
        # Any bridge / parsing failure is treated as "width unknown".
        inner = 0.0

    if inner <= 0:
        # Degraded: no viewport width available. Treat image px as CSS
        # px for this call only, so the next call retries the query.
        return 1.0

    scale = inner / _SCREENSHOT_WIDTH
    _screenshot_css_scales[tab_id] = scale
    return scale
|
||||
|
||||
|
||||
def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
"""Register browser inspection tools."""
|
||||
|
||||
@@ -180,26 +214,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
profile: str | None = None,
|
||||
full_page: bool = False,
|
||||
selector: str | None = None,
|
||||
image_type: Literal["png", "jpeg"] = "png",
|
||||
annotate: bool = True,
|
||||
width: int = _SCREENSHOT_WIDTH,
|
||||
) -> list:
|
||||
"""
|
||||
Take a screenshot of the current page.
|
||||
|
||||
Returns a normalized image alongside text metadata (URL, size, scale
|
||||
factors, etc.). Automatically annotates the last interaction (click,
|
||||
hover, type) with a bounding box overlay.
|
||||
Image is 800 px wide (JPEG quality 75, ~50–120 KB). A pixel you
|
||||
see in this image is the same number you pass to
|
||||
``browser_click_coordinate`` / ``browser_hover_coordinate`` /
|
||||
``browser_press_at`` — the tools translate to CSS internally.
|
||||
``browser_get_rect`` and ``browser_shadow_query`` likewise
|
||||
return coordinates in screenshot pixels.
|
||||
|
||||
Args:
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
full_page: Capture full scrollable page (default: False)
|
||||
selector: CSS selector to screenshot a specific element (optional)
|
||||
image_type: Image format - png or jpeg (default: png)
|
||||
annotate: Draw bounding box of last interaction on image (default: True)
|
||||
width: Output image width in pixels (default: 600). Use 800+ for fine
|
||||
text, 400 for quick layout checks.
|
||||
|
||||
Returns:
|
||||
List of content blocks: text metadata + image
|
||||
@@ -252,7 +284,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return [TextContent(type="text", text=json.dumps(screenshot_result))]
|
||||
|
||||
data = screenshot_result.get("data")
|
||||
mime_type = screenshot_result.get("mimeType", "image/png")
|
||||
css_width = screenshot_result.get("cssWidth", 0)
|
||||
dpr = screenshot_result.get("devicePixelRatio", 1.0)
|
||||
|
||||
@@ -263,45 +294,45 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
if annotate and target_tab in _interaction_highlights:
|
||||
highlights = [_interaction_highlights[target_tab]]
|
||||
|
||||
# Normalize to 800px wide and annotate. Offloaded to a
|
||||
# thread because PIL Image.open/resize/ImageDraw/composite on
|
||||
# a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
|
||||
# freeze the asyncio event loop and delay every concurrent
|
||||
# tool call during a screenshot. The function is reentrant
|
||||
# (fresh PIL Image per call, no shared state), so to_thread
|
||||
# is safe.
|
||||
# Resize to the fixed 800 px screenshot width (image px × cssScale = CSS px)
|
||||
# and re-encode as JPEG. Offloaded to a thread because PIL
|
||||
# Image.open/resize/ImageDraw/composite on a 2-megapixel
|
||||
# PNG blocks for ~150–300 ms of CPU — plenty to freeze the
|
||||
# asyncio event loop. Reentrant: no shared state.
|
||||
data, physical_scale, css_scale = await asyncio.to_thread(
|
||||
_resize_and_annotate,
|
||||
data,
|
||||
css_width,
|
||||
dpr,
|
||||
highlights,
|
||||
width,
|
||||
)
|
||||
_screenshot_scales[target_tab] = physical_scale
|
||||
_screenshot_css_scales[target_tab] = css_scale
|
||||
# Refresh caches so click / hover / press / rect tools can
|
||||
# translate image px ↔ CSS px without asking the page again.
|
||||
if target_tab is not None:
|
||||
_screenshot_scales[target_tab] = physical_scale
|
||||
_screenshot_css_scales[target_tab] = css_scale
|
||||
|
||||
meta = json.dumps(
|
||||
{
|
||||
"ok": True,
|
||||
"tabId": target_tab,
|
||||
"url": screenshot_result.get("url", ""),
|
||||
"imageType": mime_type.split("/")[-1],
|
||||
"imageType": "jpeg",
|
||||
"size": len(base64.b64decode(data)) if data else 0,
|
||||
"imageWidth": width,
|
||||
"imageWidth": _SCREENSHOT_WIDTH,
|
||||
"cssWidth": css_width,
|
||||
"fullPage": full_page,
|
||||
"devicePixelRatio": dpr,
|
||||
"physicalScale": physical_scale,
|
||||
"cssScale": css_scale,
|
||||
"annotated": bool(highlights),
|
||||
"scaleHint": (
|
||||
f"image_coord × {css_scale} = CSS px "
|
||||
f"→ feed to browser_click_coordinate, "
|
||||
f"browser_hover_coordinate, browser_press_at "
|
||||
f"(CDP Input events use CSS pixels). "
|
||||
f"image_coord × {physical_scale} = physical px "
|
||||
f"is debug-only on HiDPI displays and must NOT "
|
||||
f"be used for clicks — it overshoots by DPR×."
|
||||
"Image is 800 px wide. Pass pixel coordinates "
|
||||
"you read off this image straight into "
|
||||
"browser_click_coordinate / "
|
||||
"browser_hover_coordinate / browser_press_at — "
|
||||
"the tools translate image px → CSS px "
|
||||
"internally (cssScale is for debug only)."
|
||||
),
|
||||
}
|
||||
)
|
||||
@@ -313,17 +344,17 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
"ok": True,
|
||||
"size": len(base64.b64decode(data)) if data else 0,
|
||||
"url": screenshot_result.get("url", ""),
|
||||
"physicalScale": physical_scale,
|
||||
"cssWidth": css_width,
|
||||
"cssScale": css_scale,
|
||||
"debug_cssWidth": css_width,
|
||||
"debug_dpr": dpr,
|
||||
"physicalScale": physical_scale,
|
||||
"dpr": dpr,
|
||||
},
|
||||
duration_ms=(time.perf_counter() - start) * 1000,
|
||||
)
|
||||
|
||||
return [
|
||||
TextContent(type="text", text=meta),
|
||||
ImageContent(type="image", data=data, mimeType=mime_type),
|
||||
ImageContent(type="image", data=data, mimeType="image/jpeg"),
|
||||
]
|
||||
except Exception as e:
|
||||
log_tool_call(
|
||||
@@ -334,73 +365,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
)
|
||||
return [TextContent(type="text", text=json.dumps({"ok": False, "error": str(e)}))]
|
||||
|
||||
@mcp.tool()
|
||||
def browser_coords(
|
||||
x: float,
|
||||
y: float,
|
||||
tab_id: int | None = None,
|
||||
profile: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Convert screenshot image coordinates to browser click coordinates.
|
||||
|
||||
After browser_screenshot returns a downscaled image, use this to
|
||||
translate pixel positions you see in the image into the CSS pixel
|
||||
coordinates that Chrome DevTools Protocol expects.
|
||||
|
||||
**CDP Input.dispatchMouseEvent uses CSS pixels**, so you want
|
||||
``css_x`` / ``css_y`` for every click/hover tool. ``physical_x/y``
|
||||
is kept in the return for debugging on HiDPI displays — do NOT
|
||||
feed it to clicks; on a DPR=2 screen it lands 2× too far.
|
||||
|
||||
Edge case: pages using ``zoom`` or ``transform: scale()`` (e.g.
|
||||
LinkedIn's ``#interop-outlet`` shadow DOM) render in a scaled
|
||||
local coordinate space. For those, ``getBoundingClientRect()``
|
||||
reports pre-zoom coordinates and you may still need to multiply
|
||||
by the element's effective zoom. Use browser_shadow_query to
|
||||
get the zoomed rect directly.
|
||||
|
||||
Args:
|
||||
x: X pixel position in the screenshot image
|
||||
y: Y pixel position in the screenshot image
|
||||
tab_id: Chrome tab ID (default: active tab for profile)
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
Returns:
|
||||
Dict with css_x, css_y (primary — use these), physical_x,
|
||||
physical_y (debug only), and scale factors.
|
||||
"""
|
||||
ctx = _get_context(profile)
|
||||
target_tab = tab_id or (ctx.get("activeTabId") if ctx else None)
|
||||
|
||||
physical_scale = _screenshot_scales.get(target_tab, 1.0) if target_tab else 1.0
|
||||
# css_scale stored in second slot via _screenshot_css_scales
|
||||
css_scale = _screenshot_css_scales.get(target_tab, physical_scale) if target_tab else physical_scale
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
# Primary output: CSS pixels. Feed these to click/hover/press.
|
||||
"css_x": round(x * css_scale, 1),
|
||||
"css_y": round(y * css_scale, 1),
|
||||
# Debug output: raw physical pixels. DO NOT feed to clicks on
|
||||
# HiDPI displays — CDP Input events use CSS pixels, so sending
|
||||
# physical coordinates lands the click at roughly DPR× the
|
||||
# intended position.
|
||||
"physical_x": round(x * physical_scale, 1),
|
||||
"physical_y": round(y * physical_scale, 1),
|
||||
"physicalScale": physical_scale,
|
||||
"cssScale": css_scale,
|
||||
"tabId": target_tab,
|
||||
"note": (
|
||||
"Use css_x/css_y with browser_click_coordinate, "
|
||||
"browser_hover_coordinate, browser_press_at — "
|
||||
"Chrome DevTools Protocol Input.dispatchMouseEvent "
|
||||
"operates in CSS pixels. physical_x/y is for debugging "
|
||||
"on HiDPI displays only; feeding it to clicks lands "
|
||||
"them at DPR× the intended coordinate."
|
||||
),
|
||||
}
|
||||
|
||||
@mcp.tool()
|
||||
async def browser_shadow_query(
|
||||
selector: str,
|
||||
@@ -412,7 +376,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
|
||||
Traverses shadow roots to find elements inside closed/open shadow DOM,
|
||||
overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
|
||||
Returns getBoundingClientRect in both CSS and physical pixels.
|
||||
Returns the element's bounding rect in screenshot pixels — feed
|
||||
``rect.cx`` / ``rect.cy`` straight into browser_click_coordinate
|
||||
/ hover_coordinate / press_at.
|
||||
|
||||
Args:
|
||||
selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
|
||||
@@ -421,7 +387,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
Returns:
|
||||
Dict with rect (CSS px) and physical rect (CSS px × DPR) of the element
|
||||
Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
|
||||
"""
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
@@ -438,36 +404,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
rect = result["rect"]
|
||||
physical_scale = _screenshot_scales.get(target_tab, 1.0)
|
||||
css_scale = _screenshot_css_scales.get(target_tab, 1.0)
|
||||
dpr = physical_scale / css_scale if css_scale else 1.0
|
||||
|
||||
css_scale = await _ensure_css_scale(target_tab)
|
||||
s = css_scale if css_scale > 0 else 1.0
|
||||
return {
|
||||
"ok": True,
|
||||
"selector": selector,
|
||||
"tag": rect.get("tag"),
|
||||
"css": {
|
||||
"x": rect["x"],
|
||||
"y": rect["y"],
|
||||
"w": rect["w"],
|
||||
"h": rect["h"],
|
||||
"cx": rect["cx"],
|
||||
"cy": rect["cy"],
|
||||
},
|
||||
"physical": {
|
||||
"x": round(rect["x"] * dpr, 1),
|
||||
"y": round(rect["y"] * dpr, 1),
|
||||
"w": round(rect["w"] * dpr, 1),
|
||||
"h": round(rect["h"] * dpr, 1),
|
||||
"cx": round(rect["cx"] * dpr, 1),
|
||||
"cy": round(rect["cy"] * dpr, 1),
|
||||
"rect": {
|
||||
"x": round(rect["x"] / s, 1),
|
||||
"y": round(rect["y"] / s, 1),
|
||||
"w": round(rect["w"] / s, 1),
|
||||
"h": round(rect["h"] / s, 1),
|
||||
"cx": round(rect["cx"] / s, 1),
|
||||
"cy": round(rect["cy"] / s, 1),
|
||||
},
|
||||
"note": (
|
||||
"Use css.cx/cy with browser_click_coordinate, "
|
||||
"browser_hover_coordinate, browser_press_at — "
|
||||
"CDP Input events operate in CSS pixels. "
|
||||
"physical.* is debug-only; feeding it to clicks "
|
||||
"lands them DPR× too far on HiDPI displays."
|
||||
"rect fields are in screenshot pixels. Pass rect.cx / "
|
||||
"rect.cy to browser_click_coordinate / "
|
||||
"hover_coordinate / press_at."
|
||||
),
|
||||
}
|
||||
|
||||
@@ -480,11 +434,10 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
"""
|
||||
Get the bounding rect of an element by CSS selector.
|
||||
|
||||
Supports '>>>' shadow-piercing selectors for overlay/shadow DOM content.
|
||||
Returns coordinates in CSS pixels (for clicks and DOM APIs); the
|
||||
physical-pixel variant is returned for debugging on HiDPI displays
|
||||
only — it must not be fed to click/hover/press tools, which use
|
||||
CSS pixels.
|
||||
Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
|
||||
content. Returns the rect in screenshot pixels — the same
|
||||
numbers you'd read off a browser_screenshot, and the same
|
||||
numbers browser_click_coordinate expects.
|
||||
|
||||
Args:
|
||||
selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
|
||||
@@ -493,7 +446,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
Returns:
|
||||
Dict with css and physical bounding rects
|
||||
Dict with ``rect`` block (x, y, w, h, cx, cy) in screenshot pixels.
|
||||
"""
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
@@ -510,36 +463,24 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
rect = result["rect"]
|
||||
physical_scale = _screenshot_scales.get(target_tab, 1.0)
|
||||
css_scale = _screenshot_css_scales.get(target_tab, 1.0)
|
||||
dpr = physical_scale / css_scale if css_scale else 1.0
|
||||
|
||||
css_scale = await _ensure_css_scale(target_tab)
|
||||
s = css_scale if css_scale > 0 else 1.0
|
||||
return {
|
||||
"ok": True,
|
||||
"selector": selector,
|
||||
"tag": rect.get("tag"),
|
||||
"css": {
|
||||
"x": rect["x"],
|
||||
"y": rect["y"],
|
||||
"w": rect["w"],
|
||||
"h": rect["h"],
|
||||
"cx": rect["cx"],
|
||||
"cy": rect["cy"],
|
||||
},
|
||||
"physical": {
|
||||
"x": round(rect["x"] * dpr, 1),
|
||||
"y": round(rect["y"] * dpr, 1),
|
||||
"w": round(rect["w"] * dpr, 1),
|
||||
"h": round(rect["h"] * dpr, 1),
|
||||
"cx": round(rect["cx"] * dpr, 1),
|
||||
"cy": round(rect["cy"] * dpr, 1),
|
||||
"rect": {
|
||||
"x": round(rect["x"] / s, 1),
|
||||
"y": round(rect["y"] / s, 1),
|
||||
"w": round(rect["w"] / s, 1),
|
||||
"h": round(rect["h"] / s, 1),
|
||||
"cx": round(rect["cx"] / s, 1),
|
||||
"cy": round(rect["cy"] / s, 1),
|
||||
},
|
||||
"note": (
|
||||
"Use css.cx/cy with browser_click_coordinate, "
|
||||
"browser_hover_coordinate, browser_press_at — "
|
||||
"CDP Input events operate in CSS pixels. "
|
||||
"physical.* is debug-only; feeding it to clicks "
|
||||
"lands them DPR× too far on HiDPI displays."
|
||||
"rect fields are in screenshot pixels. Pass rect.cx / "
|
||||
"rect.cy to browser_click_coordinate / "
|
||||
"hover_coordinate / press_at."
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@@ -108,24 +108,25 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
button: Literal["left", "right", "middle"] = "left",
|
||||
) -> dict:
|
||||
"""
|
||||
Click at specific viewport coordinates (CSS pixels).
|
||||
Click at the given SCREENSHOT pixel.
|
||||
|
||||
Chrome DevTools Protocol's Input.dispatchMouseEvent operates in
|
||||
**CSS pixels**, not physical pixels. If you have a screenshot
|
||||
image coordinate, convert it with ``browser_coords(x, y)`` and
|
||||
use the returned ``css_x`` / ``css_y`` — not ``physical_x/y``.
|
||||
On a DPR=2 display, feeding physical coordinates lands the click
|
||||
at 2× the intended position.
|
||||
``x`` and ``y`` are pixel coordinates read directly off a
|
||||
``browser_screenshot`` image (800 px wide JPEG). The tool
|
||||
multiplies them by the cached image→CSS scale for the tab
|
||||
before dispatching to Chrome — no scale awareness required on
|
||||
the caller side. ``browser_get_rect`` / ``browser_shadow_query``
|
||||
return coordinates in the same (screenshot) space.
|
||||
|
||||
Args:
|
||||
x: X coordinate in CSS pixels (viewport space)
|
||||
y: Y coordinate in CSS pixels (viewport space)
|
||||
x: X coordinate in screenshot pixels.
|
||||
y: Y coordinate in screenshot pixels.
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
button: Mouse button to click (left, right, middle)
|
||||
|
||||
Returns:
|
||||
Dict with click result
|
||||
Dict with click result, including ``focused_element``
|
||||
describing what the click focused.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
|
||||
@@ -149,17 +150,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
try:
|
||||
from .inspection import _screenshot_css_scales, _screenshot_scales
|
||||
from .inspection import _ensure_css_scale
|
||||
|
||||
click_result = await bridge.click_coordinate(target_tab, x, y, button=button)
|
||||
css_scale = await _ensure_css_scale(target_tab)
|
||||
s = css_scale if css_scale > 0 else 1.0
|
||||
css_x = x * s
|
||||
css_y = y * s
|
||||
click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
|
||||
log_tool_call(
|
||||
"browser_click_coordinate",
|
||||
params,
|
||||
result={
|
||||
**click_result,
|
||||
"debug_stored_physicalScale": _screenshot_scales.get(target_tab, "unset"),
|
||||
"debug_stored_cssScale": _screenshot_css_scales.get(target_tab, "unset"),
|
||||
},
|
||||
result={**click_result, "cssScale": round(css_scale, 4)},
|
||||
duration_ms=(time.perf_counter() - start) * 1000,
|
||||
)
|
||||
return click_result
|
||||
@@ -484,15 +485,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
profile: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Hover at CSS pixel coordinates without needing a CSS selector.
|
||||
Hover at the given SCREENSHOT pixel.
|
||||
|
||||
Use this instead of browser_hover when the element is in an overlay,
|
||||
shadow DOM, or virtual-rendered component that isn't in the regular DOM.
|
||||
Pair with browser_coords to convert screenshot image positions to CSS pixels.
|
||||
``x`` / ``y`` are pixel coordinates read directly off a
|
||||
``browser_screenshot`` image; the tool translates to CSS px
|
||||
internally before dispatching to Chrome.
|
||||
|
||||
Args:
|
||||
x: CSS pixel X coordinate
|
||||
y: CSS pixel Y coordinate
|
||||
x: X coordinate in screenshot pixels.
|
||||
y: Y coordinate in screenshot pixels.
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
@@ -521,7 +524,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
try:
|
||||
hover_result = await bridge.hover_coordinate(target_tab, x, y)
|
||||
from .inspection import _ensure_css_scale
|
||||
|
||||
css_scale = await _ensure_css_scale(target_tab)
|
||||
s = css_scale if css_scale > 0 else 1.0
|
||||
hover_result = await bridge.hover_coordinate(target_tab, x * s, y * s)
|
||||
log_tool_call(
|
||||
"browser_hover_coordinate",
|
||||
params,
|
||||
@@ -548,16 +555,18 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
profile: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Move mouse to CSS pixel coordinates then press a key.
|
||||
Move mouse to the given SCREENSHOT pixel, then press a key.
|
||||
|
||||
Use this instead of browser_press when the focused element is in an overlay
|
||||
or virtual-rendered component. Moving the mouse first routes the key event
|
||||
through native browser hit-testing instead of the DOM focus chain.
|
||||
Pair with browser_coords to convert screenshot image positions to CSS pixels.
|
||||
``x`` / ``y`` are pixel coordinates read directly off a
|
||||
``browser_screenshot`` image; the tool translates to CSS px
|
||||
internally.
|
||||
|
||||
Args:
|
||||
x: CSS pixel X coordinate to position mouse
|
||||
y: CSS pixel Y coordinate to position mouse
|
||||
x: X coordinate in screenshot pixels.
|
||||
y: Y coordinate in screenshot pixels.
|
||||
key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
@@ -587,7 +596,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
try:
|
||||
press_result = await bridge.press_key_at(target_tab, x, y, key)
|
||||
from .inspection import _ensure_css_scale
|
||||
|
||||
css_scale = await _ensure_css_scale(target_tab)
|
||||
s = css_scale if css_scale > 0 else 1.0
|
||||
press_result = await bridge.press_key_at(target_tab, x * s, y * s, key)
|
||||
log_tool_call(
|
||||
"browser_press_at",
|
||||
params,
|
||||
|
||||
@@ -139,7 +139,10 @@ def main() -> None:
|
||||
mcp.run(transport="stdio")
|
||||
else:
|
||||
logger.info(f"Starting GCU server on {args.host}:{args.port}")
|
||||
mcp.run(transport="http", host=args.host, port=args.port)
|
||||
# FastMCP.run() forwards kwargs to anyio.run() instead of the
|
||||
# transport, which breaks host/port for SSE. Invoke run_async
|
||||
# directly so the kwargs land on run_sse_async.
|
||||
asyncio.run(mcp.run_async(transport="sse", host=args.host, port=args.port))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user