Merge remote-tracking branch 'origin/fix/browser-behaviour-improvements' into fix/browser-behaviour-improvements
This commit is contained in:
@@ -80,6 +80,37 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None:
|
||||
_interaction_highlights: dict[int, dict] = {}
|
||||
|
||||
|
||||
# Compact descriptor of document.activeElement. Returned by both click()
|
||||
# and click_coordinate() so the agent can verify it focused what it
|
||||
# intended, then decide whether to follow up with the selector-less
|
||||
# browser_type_focused(text=...). Keeping this as a single shared string avoids drift
|
||||
# between the two click paths.
|
||||
_FOCUSED_ELEMENT_JS = """
|
||||
(function() {
|
||||
var el = document.activeElement;
|
||||
if (!el || el === document.body) return null;
|
||||
var rect = el.getBoundingClientRect();
|
||||
var attrs = {};
|
||||
for (var i = 0; i < el.attributes.length && i < 10; i++) {
|
||||
attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
|
||||
}
|
||||
return {
|
||||
tag: el.tagName.toLowerCase(),
|
||||
id: el.id || null,
|
||||
className: el.className || null,
|
||||
name: el.getAttribute('name') || null,
|
||||
type: el.getAttribute('type') || null,
|
||||
role: el.getAttribute('role') || null,
|
||||
contenteditable: el.getAttribute('contenteditable') || null,
|
||||
text: (el.innerText || '').substring(0, 200),
|
||||
value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
|
||||
attributes: attrs,
|
||||
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
|
||||
};
|
||||
})()
|
||||
"""
|
||||
|
||||
|
||||
def _get_active_profile() -> str:
|
||||
"""Get the current active profile from context variable."""
|
||||
try:
|
||||
@@ -763,7 +794,8 @@ class BeelineBridge:
|
||||
rx = value.get("x", 0) - value.get("width", 0) / 2
|
||||
ry = value.get("y", 0) - value.get("height", 0) / 2
|
||||
await self.highlight_rect(tab_id, rx, ry, value.get("width", 0), value.get("height", 0), label=selector)
|
||||
return {
|
||||
focused_info = await self._read_focused_element(tab_id)
|
||||
resp = {
|
||||
"ok": True,
|
||||
"action": "click",
|
||||
"selector": selector,
|
||||
@@ -771,6 +803,9 @@ class BeelineBridge:
|
||||
"y": value.get("y", 0),
|
||||
"method": "javascript",
|
||||
}
|
||||
if focused_info:
|
||||
resp["focused_element"] = focused_info
|
||||
return resp
|
||||
|
||||
# If JavaScript click failed, try CDP approach
|
||||
if isinstance(value, dict) and value.get("error"):
|
||||
@@ -883,7 +918,8 @@ class BeelineBridge:
|
||||
w = bounds_value.get("width", 0)
|
||||
h = bounds_value.get("height", 0)
|
||||
await self.highlight_rect(tab_id, x - w / 2, y - h / 2, w, h, label=selector)
|
||||
return {
|
||||
focused_info = await self._read_focused_element(tab_id)
|
||||
resp = {
|
||||
"ok": True,
|
||||
"action": "click",
|
||||
"selector": selector,
|
||||
@@ -891,10 +927,29 @@ class BeelineBridge:
|
||||
"y": y,
|
||||
"method": "cdp",
|
||||
}
|
||||
if focused_info:
|
||||
resp["focused_element"] = focused_info
|
||||
return resp
|
||||
|
||||
except Exception as e:
|
||||
return {"ok": False, "error": f"Click failed: {e}"}
|
||||
|
||||
async def _read_focused_element(self, tab_id: int) -> dict | None:
    """Describe whatever element currently holds focus in the tab.

    Evaluates the shared ``_FOCUSED_ELEMENT_JS`` snippet and hands back
    the value stored under the ``"result"`` key of the evaluation
    response. Both click paths (selector-based ``click()`` and
    ``click_coordinate()``) call this, so the agent sees one response
    shape and can tell whether its click landed on an editable element
    without a second round-trip.

    Args:
        tab_id: Tab to inspect.

    Returns:
        The focused-element descriptor, or None when the evaluation
        yields nothing or any ``Exception`` is raised along the way
        (this helper is best-effort and does not propagate them).
    """
    try:
        # Enable the Runtime domain before evaluating the snippet.
        await self._try_enable_domain(tab_id, "Runtime")
        evaluation = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
        if not evaluation:
            # Empty / missing response: nothing to report.
            return None
        return evaluation.get("result")
    except Exception:
        # Focus inspection is optional metadata; a failure here must
        # not turn a successful click into an error.
        return None
|
||||
|
||||
async def click_coordinate(self, tab_id: int, x: float, y: float, button: str = "left") -> dict:
|
||||
"""Click at specific coordinates."""
|
||||
await self.cdp_attach(tab_id)
|
||||
@@ -931,40 +986,7 @@ class BeelineBridge:
|
||||
|
||||
await self.highlight_point(tab_id, x, y, label=f"click ({x},{y})")
|
||||
|
||||
# Query the focused element after the click
|
||||
focused_info = None
|
||||
try:
|
||||
await self._try_enable_domain(tab_id, "Runtime")
|
||||
result = await self.evaluate(
|
||||
tab_id,
|
||||
"""
|
||||
(function() {
|
||||
var el = document.activeElement;
|
||||
if (!el || el === document.body) return null;
|
||||
var rect = el.getBoundingClientRect();
|
||||
var attrs = {};
|
||||
for (var i = 0; i < el.attributes.length && i < 10; i++) {
|
||||
attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
|
||||
}
|
||||
return {
|
||||
tag: el.tagName.toLowerCase(),
|
||||
id: el.id || null,
|
||||
className: el.className || null,
|
||||
name: el.getAttribute('name') || null,
|
||||
type: el.getAttribute('type') || null,
|
||||
role: el.getAttribute('role') || null,
|
||||
text: (el.innerText || '').substring(0, 200),
|
||||
value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
|
||||
attributes: attrs,
|
||||
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
|
||||
};
|
||||
})()
|
||||
""",
|
||||
)
|
||||
focused_info = (result or {}).get("result")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
focused_info = await self._read_focused_element(tab_id)
|
||||
resp = {"ok": True, "action": "click_coordinate", "x": x, "y": y}
|
||||
if focused_info:
|
||||
resp["focused_element"] = focused_info
|
||||
|
||||
@@ -185,37 +185,39 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
use_insert_text: bool = True,
|
||||
) -> dict:
|
||||
"""
|
||||
Type text into an input element.
|
||||
Type text into an element identified by CSS selector.
|
||||
|
||||
Automatically routes through a real CDP pointer click on the
|
||||
element before inserting text — so that rich-text editors like
|
||||
Lexical (Gmail, LinkedIn DMs), Draft.js (X compose), and
|
||||
ProseMirror (Reddit) see a native focus event and enable their
|
||||
submit buttons. See the gcu-browser skill for the full "click-
|
||||
then-type" pattern.
|
||||
Performs a CDP click on the selector first (to focus it), then
|
||||
inserts text via CDP Input.insertText. This handles plain
|
||||
<input>/<textarea> as well as rich-text editors (Lexical,
|
||||
Draft.js, ProseMirror) that require a real pointer-sourced
|
||||
focus event.
|
||||
|
||||
By default uses CDP Input.insertText which is the most reliable
|
||||
way to insert text into rich editors. Set
|
||||
``use_insert_text=False`` to fall back to per-character
|
||||
keyDown/keyUp events (needed only for code editors that fire
|
||||
on specific keystrokes, or when ``delay_ms`` typing animation
|
||||
is required).
|
||||
For the click-coordinate-then-type pattern (shadow DOM, iframes,
|
||||
or when you don't have a reliable selector), use
|
||||
``browser_type_focused`` instead — it types into
|
||||
document.activeElement without needing a selector.
|
||||
|
||||
By default uses CDP Input.insertText (``use_insert_text=True``).
|
||||
Set ``use_insert_text=False`` only for code editors that watch
|
||||
specific keystrokes, or when ``delay_ms`` typing animation is
|
||||
required.
|
||||
|
||||
Args:
|
||||
selector: CSS selector for the input element
|
||||
text: Text to type
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
selector: CSS selector for the input element.
|
||||
text: Text to type.
|
||||
tab_id: Chrome tab ID (default: active tab).
|
||||
profile: Browser profile name (default: "default").
|
||||
delay_ms: Delay between keystrokes in ms (default: 0).
|
||||
Forces the per-keystroke fallback when > 0.
|
||||
clear_first: Clear existing text before typing (default: True)
|
||||
timeout_ms: Timeout waiting for element (default: 30000)
|
||||
Forces the per-keystroke fallback when > 0.
|
||||
clear_first: Clear existing text before typing (default: True).
|
||||
timeout_ms: Timeout waiting for element (default: 30000).
|
||||
use_insert_text: Use CDP Input.insertText (default: True) for
|
||||
reliable insertion into rich-text editors.
|
||||
Set False for per-keystroke dispatch.
|
||||
reliable insertion into rich-text editors. Set False for
|
||||
per-keystroke dispatch.
|
||||
|
||||
Returns:
|
||||
Dict with type result
|
||||
Dict with type result.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"selector": selector, "text": text, "tab_id": tab_id, "profile": profile}
|
||||
@@ -303,24 +305,35 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
use_insert_text: bool = True,
|
||||
) -> dict:
|
||||
"""
|
||||
Type text into the already-focused element.
|
||||
Type text into the already-focused element (document.activeElement).
|
||||
|
||||
Use after browser_click_coordinate (or browser_click) has
|
||||
focused the target element. Inserts text via CDP
|
||||
Input.insertText by default — much faster than repeated
|
||||
browser_press calls for multi-character input.
|
||||
CANONICAL PATTERN:
|
||||
browser_click_coordinate(x, y) # focus the target
|
||||
browser_type_focused(text="...") # type into it
|
||||
|
||||
CDP's Input.insertText takes no target parameter — it operates
|
||||
implicitly on the focused editable. This makes it shadow-agnostic
|
||||
and iframe-agnostic, and the ONLY reliable way to type into:
|
||||
- LinkedIn's #interop-outlet Lexical composer
|
||||
- X/Twitter's Draft.js compose box
|
||||
- Reddit's ProseMirror comment box
|
||||
- Any site wrapped in Trusted Types CSP
|
||||
- Any nested-iframe message overlay
|
||||
|
||||
Much faster than repeated browser_press calls for multi-character
|
||||
input.
|
||||
|
||||
Args:
|
||||
text: Text to type
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
text: Text to insert at the current cursor position.
|
||||
tab_id: Chrome tab ID (default: active tab).
|
||||
profile: Browser profile name (default: "default").
|
||||
delay_ms: Delay between keystrokes in ms (default: 0).
|
||||
Forces per-keystroke dispatch when > 0.
|
||||
clear_first: Clear existing text before typing (default: True)
|
||||
use_insert_text: Use CDP Input.insertText (default: True)
|
||||
clear_first: Clear existing text before typing (default: True).
|
||||
use_insert_text: Use CDP Input.insertText (default: True).
|
||||
|
||||
Returns:
|
||||
Dict with type result
|
||||
Dict with type result.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"text": text, "tab_id": tab_id, "profile": profile}
|
||||
|
||||
Reference in New Issue
Block a user