fix: diagnostics

This commit is contained in:
Timothy
2026-04-19 12:52:04 -07:00
parent ddaafe0307
commit e55cea97ef
5 changed files with 436 additions and 53 deletions
+3 -1
View File
@@ -57,7 +57,9 @@
"mcp__gcu-tools__browser_type_focused",
"mcp__gcu-tools__browser_wait",
"Bash(python3 -c ' *)",
"Bash(python3 scripts/debug_queen_prompt.py independent)"
"Bash(python3 scripts/debug_queen_prompt.py independent)",
"Bash(curl -s --max-time 2 http://127.0.0.1:9230/status)",
"Bash(python3 -c \"import json, sys; print\\(json.loads\\(sys.stdin.read\\(\\)\\)['data']['content']\\)\")"
],
"additionalDirectories": [
"/home/timothy/.hive/skills/writing-hive-skills",
+28
View File
@@ -211,6 +211,34 @@ chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
chrome.runtime.onInstalled.addListener(ensureOffscreen);
chrome.runtime.onStartup.addListener(ensureOffscreen);
// ---------------------------------------------------------------------------
// CDP event forwarder — diagnostic channel
// ---------------------------------------------------------------------------
//
// chrome.debugger.sendCommand (the cdp handler above) only responds to
// requests. CDP also emits unsolicited EVENTS (Runtime.consoleAPICalled,
// Page.frameResized, Target.targetInfoChanged, …) that the bridge doesn't
// see today. Forward the narrow subset we're currently diagnosing so the
// Python side can correlate viewport changes with page lifecycle events.
// Filtered at the source to keep the wire slim.
// Allow-list of CDP event method names that are relayed to the bridge.
// Any event whose method is not in this set is dropped by the listener
// below before anything is sent.
const FORWARDED_CDP_EVENTS = new Set([
"Runtime.consoleAPICalled",
"Page.lifecycleEvent",
"Page.frameResized",
"Page.frameNavigated",
"Target.targetInfoChanged",
]);
// Relay each allow-listed event through wsSend as a "cdp_event" message
// carrying the originating tab id, the CDP method name and its params.
chrome.debugger.onEvent.addListener((source, method, params) => {
if (!FORWARDED_CDP_EVENTS.has(method)) return;
wsSend({
type: "cdp_event",
tabId: source.tabId,
method,
// Always send an object: null/undefined params become {}.
params: params ?? {},
});
});
// Periodic alarm keeps the service worker from being garbage-collected and
// recreates the offscreen document if it was evicted.
chrome.alarms.create("keepAlive", { periodInMinutes: 0.4 });
+231 -2
View File
@@ -166,6 +166,41 @@ _HIT_ELEMENT_JS = """
"""
# Diagnostic probe — page-side script that reports viewport/visibility
# state through console.info so the CDP event channel
# (Runtime.consoleAPICalled) forwards it to our telemetry.
#
# Each report is ``console.info('[hive_vp]', <JSON string>)`` where the
# JSON carries: kind, innerWidth/innerHeight, visualW/visualH, docHidden,
# visibilityState, scrollX/scrollY, dpr and ts (Date.now()).
# A sample is emitted once on install (kind 'init') and again on window
# resize, visualViewport resize (when visualViewport exists) and
# visibilitychange.
# Idempotent via ``window.__hive_vp_instrumented``: a second evaluation
# in the same page returns immediately without adding listeners.
# Errors raised while sampling are swallowed inside the page.
_HIVE_VP_PROBE_JS = """
(function () {
if (window.__hive_vp_instrumented) return;
window.__hive_vp_instrumented = true;
function sample(kind) {
try {
console.info('[hive_vp]', JSON.stringify({
kind: kind,
innerWidth: window.innerWidth,
innerHeight: window.innerHeight,
visualW: window.visualViewport && window.visualViewport.width,
visualH: window.visualViewport && window.visualViewport.height,
docHidden: document.hidden,
visibilityState: document.visibilityState,
scrollX: window.scrollX,
scrollY: window.scrollY,
dpr: window.devicePixelRatio,
ts: Date.now()
}));
} catch (e) {}
}
sample('init');
window.addEventListener('resize', function () { sample('resize'); });
if (window.visualViewport) {
window.visualViewport.addEventListener('resize', function () { sample('visualResize'); });
}
document.addEventListener('visibilitychange', function () { sample('visibility'); });
})();
"""
_FOCUSED_ELEMENT_JS = """
(function() {
function describe(el) {
@@ -368,6 +403,23 @@ class BeelineBridge:
log_connection_event("hello", {"version": msg.get("version")})
continue
if msg.get("type") == "cdp_event":
# Unsolicited CDP event forwarded by the extension.
# Narrow diagnostic channel — see FORWARDED_CDP_EVENTS
# in browser-extension/background.js. We pick out
# the [hive_vp] console probe as a structured
# viewport_event telemetry entry and also log the
# raw event for correlation with page lifecycle.
try:
self._handle_cdp_event(
msg.get("tabId"),
msg.get("method", ""),
msg.get("params") or {},
)
except Exception:
pass
continue
msg_id = msg.get("id")
if msg_id and msg_id in self._pending:
fut = self._pending.pop(msg_id)
@@ -392,6 +444,84 @@ class BeelineBridge:
fut.cancel()
self._pending.clear()
def _handle_cdp_event(self, tab_id: int | None, method: str, params: dict) -> None:
    """Route a CDP event forwarded by the extension into telemetry.

    Synchronous and best-effort — a bad event must never break the
    bridge's read loop.

    ``Runtime.consoleAPICalled`` events whose first argument is one of
    our marker strings (and whose second argument is a string) are
    split off as structured entries so the reader can ``grep`` them
    without touching the raw console log:

    * ``[hive_vp]`` → ``viewport_event`` (viewport probe samples)
    * ``[hive_attach_canary]`` → ``attach_canary`` (proves the
      extension forwarder is alive end-to-end)

    The second argument is parsed as JSON and merged into the entry.
    If it is not valid JSON, or is valid JSON but not an object, it is
    kept under ``_raw`` instead. Payload keys that collide with the
    entry's own ``type`` / ``tab_id`` fields are stored with a
    ``payload_`` prefix so page-controlled content cannot re-label a
    telemetry row.

    Any other console call is logged as a compact ``cdp_event`` row
    (first four args, each truncated to 120 characters). Every other
    forwarded method is logged under ``cdp_event`` with its params
    verbatim.

    Args:
        tab_id: Tab the event originated from, or ``None`` if the
            message carried no tab id.
        method: CDP event name, e.g. ``"Page.frameResized"``.
        params: CDP event parameters as forwarded by the extension.
    """
    from .telemetry import write_log

    if method != "Runtime.consoleAPICalled":
        # Page.lifecycleEvent, frameResized, frameNavigated and
        # Target.targetInfoChanged are rare and high signal — keep
        # the full param dict, unmodified.
        write_log({
            "type": "cdp_event",
            "tab_id": tab_id,
            "method": method,
            "params": params,
        })
        return

    args = params.get("args") if isinstance(params, dict) else None
    if not isinstance(args, list):
        args = []
    first = args[0].get("value") if args and isinstance(args[0], dict) else None
    payload = args[1].get("value") if len(args) >= 2 and isinstance(args[1], dict) else None

    # Marker prefix → telemetry entry type.
    structured_types = {
        "[hive_vp]": "viewport_event",
        "[hive_attach_canary]": "attach_canary",
    }
    # isinstance guard: ``first`` comes from the page and may be unhashable.
    entry_type = structured_types.get(first) if isinstance(first, str) else None

    if entry_type is not None and isinstance(payload, str):
        try:
            parsed = json.loads(payload)
        except (ValueError, RecursionError):
            parsed = None
        if not isinstance(parsed, dict):
            # Invalid JSON, or valid JSON that is not an object
            # (list / number / string): keep the raw text rather
            # than failing on the merge below.
            parsed = {"_raw": payload}
        entry = {"type": entry_type, "tab_id": tab_id}
        reserved = frozenset(entry)
        for key, value in parsed.items():
            # Page-controlled keys must not overwrite routing fields.
            entry[f"payload_{key}" if key in reserved else key] = value
        write_log(entry)
        return

    # Everything else — keep a compact row so we can tell whether ANY
    # console output is flowing through the pipe. Only the first four
    # args are kept and each is truncated so a chatty page can't
    # flood the log.
    compact = []
    for arg in args[:4]:
        if not isinstance(arg, dict):
            continue
        value = arg.get("value")
        if value is None:
            continue
        text = value if isinstance(value, str) else str(value)
        compact.append(text[:120])
    write_log({
        "type": "cdp_event",
        "tab_id": tab_id,
        "method": method,
        "level": params.get("type") if isinstance(params, dict) else None,
        "args": compact,
    })
# Default wait on a bridge command. Callers with known-slow ops
# (full-page screenshots on slow networks, AX tree on huge pages)
# can pass a longer value via _send(..., timeout=...). Using the
@@ -594,12 +724,111 @@ class BeelineBridge:
"""Attach CDP debugger to a tab.
Returns {"ok": bool}.
First-attach-per-tab triggers Chrome's "<extension> started
debugging this browser" infobar, which shrinks the layout
viewport by ~30–70 CSS px. The banner's commit is async from
the attach return, so a screenshot taken immediately after
can capture the pre-banner layout, leaving the viewport
cache stale until the next screenshot or
``_ensure_viewport_size`` call. We wait a short grace here
and proactively prime the viewport cache with the settled
(post-banner) dimensions, so the very first coord-conversion
after attach already operates on the real frame.
"""
if tab_id in self._cdp_attached:
return {"ok": True, "attached": False, "message": "Already attached"}
result = await self._send("cdp.attach", tabId=tab_id)
if result.get("ok"):
self._cdp_attached.add(tab_id)
if not result.get("ok"):
return result
self._cdp_attached.add(tab_id)
# Prime the viewport cache so the first coord-conversion
# after attach has a reasonable seed. Also install the
# diagnostic viewport-change probe ([hive_vp] console
# messages that stream through our CDP-event channel).
# Failures are silent — cache will heal on next screenshot
# or _ensure_viewport_size call.
try:
from .tools.inspection import _viewport_sizes
eval_res = await self._cdp(
tab_id,
"Runtime.evaluate",
{
"expression": "({w: window.innerWidth, h: window.innerHeight})",
"returnByValue": True,
},
)
inner = (eval_res or {}).get("result", {}).get("value") or {}
cw = int(float(inner.get("w") or 0))
ch = int(float(inner.get("h") or 0))
if cw > 0 and ch > 0:
_viewport_sizes[tab_id] = (cw, ch)
except Exception:
pass
# Runtime must be enabled for consoleAPICalled events to
# fire; Page must be enabled for frame* / lifecycle events
# to reach the extension. Page.setLifecycleEventsEnabled
# is the critical one — without it Chrome withholds the
# DOMContentLoaded / load / firstMeaningfulPaint stream.
# Each wrapped in try so a failure on one domain doesn't
# block the others.
try:
await self._cdp(tab_id, "Runtime.enable", {})
except Exception:
pass
try:
await self._cdp(tab_id, "Page.enable", {})
except Exception:
pass
try:
await self._cdp(tab_id, "Page.setLifecycleEventsEnabled", {"enabled": True})
except Exception:
pass
# [hive_vp] probe — install resize / visibility listeners on
# the page so Chrome tells us when the renderer sees a
# viewport change. Uses console.info as a cheap transport
# through CDP; filtered server-side by the cdp_event
# handler. Idempotent via __hive_vp_instrumented.
try:
await self._cdp(
tab_id,
"Runtime.evaluate",
{
"expression": _HIVE_VP_PROBE_JS,
"returnByValue": True,
"awaitPromise": False,
},
)
except Exception:
pass
# Canary — emit a recognisable marker from the page so we
# can verify end-to-end (page → CDP → extension → bridge →
# telemetry) is wired. Should produce one ``cdp_event``
# with method=Runtime.consoleAPICalled whose args start
# ``[hive_attach_canary]``. Zero canary entries after a
# run means the extension forwarder is stale and the user
# needs to reload the Hive extension in chrome://extensions.
try:
await self._cdp(
tab_id,
"Runtime.evaluate",
{
"expression": (
"console.info('[hive_attach_canary]', "
"JSON.stringify({tabId: "
+ str(tab_id) + ", ts: Date.now()}))"
),
"returnByValue": True,
"awaitPromise": False,
},
)
except Exception:
pass
return result
async def cdp_detach(self, tab_id: int) -> dict:
+59 -21
View File
@@ -46,16 +46,17 @@ _screenshot_scales: dict[int, float] = {}
def clear_tab_state(tab_ids) -> None:
"""Drop cached screenshot scales for the given tab_ids.
"""Drop cached screenshot scales and viewport sizes for the given tab_ids.
Called when a tab closes or a profile's context is destroyed so stale
scale values can't bleed into a later tab that Chrome happens to assign
cache values can't bleed into a later tab that Chrome happens to assign
the same id. Accepts a single id or any iterable.
"""
if isinstance(tab_ids, int):
tab_ids = (tab_ids,)
for tid in tab_ids:
_screenshot_scales.pop(tid, None)
_viewport_sizes.pop(tid, None)
def _resize_and_annotate(
@@ -195,34 +196,71 @@ def _resize_and_annotate(
return data, 1.0
async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
"""Return ``(cssWidth, cssHeight)`` for ``tab_id``, populating the
cache via ``window.innerWidth`` / ``window.innerHeight`` on miss.
async def _ensure_viewport_size(tab_id: int, _caller: str = "unknown") -> tuple[int, int]:
"""Return ``(cssWidth, cssHeight)`` for ``tab_id``, always
refreshing from ``window.innerWidth`` / ``window.innerHeight``.
Used by click / hover / press tools to turn fractional inputs
(0..1) into CSS px, and by rect tools to turn CSS-px rects into
fractions. Degrades to ``(1, 1)`` if the bridge can't be queried —
that makes every coord an identity op, which is a safe no-op
(and preferable to crashing).
fractions.
Every call emits a ``viewport_sample`` telemetry entry so we
can build a timeline of Chrome's reported viewport across an
agent run — needed to diagnose the sessions where cssH changes
silently (no visible layout shift) between screenshot and
click. The entry records the live value, the cached value, and
the delta so the transition point is trivial to locate in
``~/.hive/browser-logs/browser-YYYY-MM-DD.jsonl``.
Falls back to the cached value on evaluate failure, then to
``(1, 1)`` if there's no cache — identity-op is a safe no-op.
"""
cached = _viewport_sizes.get(tab_id)
if cached is not None and cached[0] > 0 and cached[1] > 0:
return cached
bridge = get_bridge()
cw = ch = 0
evaluate_error: str | None = None
try:
result = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
inner = (result or {}).get("result") or {}
cw = int(float(inner.get("w") or 0))
ch = int(float(inner.get("h") or 0))
except Exception:
cw, ch = 0, 0
except Exception as e:
evaluate_error = str(e)
cw = ch = 0
cached_before = _viewport_sizes.get(tab_id)
if cw <= 0 or ch <= 0:
# Degraded: bridge didn't return viewport. Cache an identity
# so we don't retry on every call; corrects itself after the
# next successful browser_screenshot.
cw, ch = 1, 1
_viewport_sizes[tab_id] = (cw, ch)
return cw, ch
if cached_before is not None and cached_before[0] > 0 and cached_before[1] > 0:
result_cw, result_ch = cached_before
else:
result_cw, result_ch = 1, 1
else:
result_cw, result_ch = cw, ch
_viewport_sizes[tab_id] = (cw, ch)
try:
from ..telemetry import write_log
write_log({
"type": "viewport_sample",
"tab_id": tab_id,
"caller": _caller,
"live_w": cw,
"live_h": ch,
"cached_w": cached_before[0] if cached_before else None,
"cached_h": cached_before[1] if cached_before else None,
"deltaH_vs_cache": (
(ch - cached_before[1])
if (cached_before and ch > 0)
else None
),
"returned_w": result_cw,
"returned_h": result_ch,
"evaluate_error": evaluate_error,
})
except Exception:
pass
return result_cw, result_ch
def register_inspection_tools(mcp: FastMCP) -> None:
@@ -475,7 +513,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
return result
rect = result["rect"]
cw, ch = await _ensure_viewport_size(target_tab)
cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_shadow_query")
cw_f = float(cw) if cw > 0 else 1.0
ch_f = float(ch) if ch > 0 else 1.0
return {
@@ -538,7 +576,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
return result
rect = result["rect"]
cw, ch = await _ensure_viewport_size(target_tab)
cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_get_rect")
cw_f = float(cw) if cw > 0 else 1.0
ch_f = float(ch) if ch > 0 else 1.0
return {
+115 -29
View File
@@ -7,11 +7,13 @@ All operations go through the Beeline extension via CDP - no Playwright required
from __future__ import annotations
import asyncio
import json
import logging
import time
from typing import Literal
from fastmcp import FastMCP
from mcp.types import ImageContent, TextContent
from ..bridge import get_bridge
from ..telemetry import log_tool_call
@@ -28,6 +30,57 @@ _AUTO_SNAPSHOT_SETTLE_S = 0.5
AutoSnapshotMode = Literal["default", "simple", "interactive", "off"]
def _text_only(result: dict) -> list:
    """Serialise ``result`` to JSON and return it as a one-element MCP content list.

    The coordinate interaction tools are declared to return a list of
    content blocks. Their early-error paths use this helper so they
    hand back that same list shape instead of a bare dict, which avoids
    a fragile dict-vs-list union in the tool's return type.
    """
    payload = json.dumps(result)
    block = TextContent(type="text", text=payload)
    return [block]
async def _build_visual_response(result: dict, bridge, target_tab: int | None) -> list:
    """Return ``result`` as a text block, plus an annotated screenshot when possible.

    Coordinate-based interactions (click / hover / press_at) return
    through here so the agent sees the page as it looks immediately
    after the action — with the interaction marker overlaid — and can
    self-correct on a near-miss in the same turn instead of issuing a
    separate ``browser_screenshot`` call. The marker is taken from
    ``_interaction_highlights`` when it holds an entry for the tab;
    otherwise the screenshot is produced without highlights.

    Degrades to a text-only, single-block response when the action did
    not report ``ok``, when there is no tab or bridge, when the
    screenshot itself reports failure, or when anything in the capture
    / annotate path raises — the screenshot never blocks the
    interaction result.
    """
    text_block = TextContent(type="text", text=json.dumps(result))
    fallback = [text_block]

    can_capture = result.get("ok") and target_tab is not None and bridge is not None
    if not can_capture:
        return fallback

    try:
        from ..bridge import _interaction_highlights
        from .inspection import _resize_and_annotate

        shot = await bridge.screenshot(target_tab, full_page=False)
        if not shot.get("ok"):
            return fallback

        if target_tab in _interaction_highlights:
            highlights = [_interaction_highlights[target_tab]]
        else:
            highlights = None

        # Image processing runs in a worker thread so the event loop
        # stays free while the frame is resized and annotated.
        image_data, _scale = await asyncio.to_thread(
            _resize_and_annotate,
            shot["data"],
            shot.get("cssWidth", 0),
            shot.get("devicePixelRatio", 1.0),
            highlights,
        )
        image_block = ImageContent(type="image", data=image_data, mimeType="image/jpeg")
        return [text_block, image_block]
    except Exception:
        return fallback
async def _attach_snapshot(result: dict, bridge, target_tab: int, auto_snapshot_mode: str) -> dict:
"""If the interaction succeeded and the caller opted into auto-snapshot,
wait for the page to settle and attach an accessibility snapshot under
@@ -139,7 +192,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
tab_id: int | None = None,
profile: str | None = None,
button: Literal["left", "right", "middle"] = "left",
) -> dict:
) -> list:
"""
Click at a FRACTION of the viewport (0..1, 0..1).
@@ -155,6 +208,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
tiles, etc.). Proportional positions survive every such
transform; pixel coords do not.
Precision floor: visual coordinate picking from a screenshot
is reliable to roughly **3 % of the viewport** (~25–50 CSS px
on a 1280×800 window). The y-axis tends to drift more than x
because vision models perceive vertical centres less
accurately. For targets smaller than that — narrow buttons,
checkboxes, dense rows, links — look up the rect with
``browser_get_rect`` (selector-based) or ``browser_shadow_query``
(web-component) and pass ``rect.cx`` / ``rect.cy`` directly.
The response is a 2-block list: a JSON text block with the
click result, and a fresh annotated screenshot showing where
the click landed (red marker at the dispatched coord). Use
the screenshot to verify; if the marker is sitting on the
wrong element, retry with the rect-derived centre instead of
re-eyeballing.
Args:
x: X fraction of the viewport (0..1).
y: Y fraction of the viewport (0..1).
@@ -163,9 +232,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
button: Mouse button to click (left, right, middle)
Returns:
Dict with click result, including ``focused_element``
describing what the click focused. ``focused_element.rect``
is also in fractions.
List with two content blocks: TextContent(JSON of the
click result, including ``focused_element`` and its rect
in fractions) and ImageContent(annotated post-click
screenshot). Falls back to a single-block text-only
response on any error.
"""
start = time.perf_counter()
params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
@@ -174,19 +245,19 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not bridge or not bridge.is_connected:
result = {"ok": False, "error": "Browser extension not connected"}
log_tool_call("browser_click_coordinate", params, result=result)
return result
return _text_only(result)
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
log_tool_call("browser_click_coordinate", params, result=result)
return result
return _text_only(result)
target_tab = tab_id or ctx.get("activeTabId")
if target_tab is None:
result = {"ok": False, "error": "No active tab"}
log_tool_call("browser_click_coordinate", params, result=result)
return result
return _text_only(result)
# Pixel-input guard: legitimate fractions live in [0, 1]. Allow a
# small overshoot tolerance for edge targets.
@@ -202,12 +273,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
),
}
log_tool_call("browser_click_coordinate", params, result=result)
return result
return _text_only(result)
try:
from .inspection import _ensure_viewport_size
cw, ch = await _ensure_viewport_size(target_tab)
cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_click_coordinate")
css_x = x * cw
css_y = y * ch
click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
@@ -217,7 +288,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
result={**click_result, "cssWidth": cw, "cssHeight": ch},
duration_ms=(time.perf_counter() - start) * 1000,
)
return click_result
return await _build_visual_response(click_result, bridge, target_tab)
except Exception as e:
result = {"ok": False, "error": str(e)}
log_tool_call(
@@ -226,7 +297,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
error=e,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
return _text_only(result)
@mcp.tool()
async def browser_type(
@@ -558,7 +629,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
y: float,
tab_id: int | None = None,
profile: str | None = None,
) -> dict:
) -> list:
"""
Hover at a FRACTION of the viewport (0..1, 0..1).
@@ -567,6 +638,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
the tool converts to CSS px internally.
Same precision-floor caveat as ``browser_click_coordinate``:
for sub-3 % targets, use rect-derived coords from
``browser_get_rect`` / ``browser_shadow_query``.
Args:
x: X fraction of the viewport (0..1).
y: Y fraction of the viewport (0..1).
@@ -574,7 +649,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: Browser profile name (default: "default")
Returns:
Dict with hover result
List with two content blocks: TextContent(JSON of the
hover result) and ImageContent(annotated post-hover
screenshot showing the cursor marker). Useful for
verifying tooltip / hover-state changes triggered. Falls
back to text-only on error.
"""
start = time.perf_counter()
params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile}
@@ -583,19 +662,19 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not bridge or not bridge.is_connected:
result = {"ok": False, "error": "Browser extension not connected"}
log_tool_call("browser_hover_coordinate", params, result=result)
return result
return _text_only(result)
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
log_tool_call("browser_hover_coordinate", params, result=result)
return result
return _text_only(result)
target_tab = tab_id or ctx.get("activeTabId")
if target_tab is None:
result = {"ok": False, "error": "No active tab"}
log_tool_call("browser_hover_coordinate", params, result=result)
return result
return _text_only(result)
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
result = {
@@ -603,12 +682,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
"error": (f"Coords ({x}, {y}) look like pixels. This tool expects fractions 0..1 of the viewport."),
}
log_tool_call("browser_hover_coordinate", params, result=result)
return result
return _text_only(result)
try:
from .inspection import _ensure_viewport_size
cw, ch = await _ensure_viewport_size(target_tab)
cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_hover_coordinate")
hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
log_tool_call(
"browser_hover_coordinate",
@@ -616,7 +695,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
result=hover_result,
duration_ms=(time.perf_counter() - start) * 1000,
)
return hover_result
return await _build_visual_response(hover_result, bridge, target_tab)
except Exception as e:
result = {"ok": False, "error": str(e)}
log_tool_call(
@@ -625,7 +704,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
error=e,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
return _text_only(result)
@mcp.tool()
async def browser_press_at(
@@ -634,7 +713,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
key: str,
tab_id: int | None = None,
profile: str | None = None,
) -> dict:
) -> list:
"""
Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.
@@ -644,6 +723,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
``x`` / ``y`` are fractions of the viewport; the tool converts
to CSS px internally.
Same precision-floor caveat as ``browser_click_coordinate``:
for sub-3 % targets, use rect-derived coords from
``browser_get_rect`` / ``browser_shadow_query``.
Args:
x: X fraction of the viewport (0..1).
y: Y fraction of the viewport (0..1).
@@ -652,7 +735,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: Browser profile name (default: "default")
Returns:
Dict with press result
List with two content blocks: TextContent(JSON of the
press result) and ImageContent(annotated post-press
screenshot showing where the key was dispatched). Falls
back to text-only on error.
"""
start = time.perf_counter()
params = {"x": x, "y": y, "key": key, "tab_id": tab_id, "profile": profile}
@@ -661,19 +747,19 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not bridge or not bridge.is_connected:
result = {"ok": False, "error": "Browser extension not connected"}
log_tool_call("browser_press_at", params, result=result)
return result
return _text_only(result)
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
log_tool_call("browser_press_at", params, result=result)
return result
return _text_only(result)
target_tab = tab_id or ctx.get("activeTabId")
if target_tab is None:
result = {"ok": False, "error": "No active tab"}
log_tool_call("browser_press_at", params, result=result)
return result
return _text_only(result)
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
result = {
@@ -681,12 +767,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
"error": (f"Coords ({x}, {y}) look like pixels. This tool expects fractions 0..1 of the viewport."),
}
log_tool_call("browser_press_at", params, result=result)
return result
return _text_only(result)
try:
from .inspection import _ensure_viewport_size
cw, ch = await _ensure_viewport_size(target_tab)
cw, ch = await _ensure_viewport_size(target_tab, _caller="browser_press_at")
press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
log_tool_call(
"browser_press_at",
@@ -694,7 +780,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
result=press_result,
duration_ms=(time.perf_counter() - start) * 1000,
)
return press_result
return await _build_visual_response(press_result, bridge, target_tab)
except Exception as e:
result = {"ok": False, "error": str(e)}
log_tool_call(
@@ -703,7 +789,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
error=e,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
return _text_only(result)
@mcp.tool()
async def browser_select(