Merge remote-tracking branch 'origin/fix/browser-behaviour-improvements' into fix/browser-behaviour-improvements

2026-04-16 16:14:43 -07:00
parent 28cad2376c 8222cd306e
commit 9e71f16d15
4 changed files with 149 additions and 100 deletions
@@ -80,6 +80,37 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None:
 _interaction_highlights: dict[int, dict] = {}


+# Compact descriptor of document.activeElement. Returned by both click()
+# and click_coordinate() so the agent can verify it focused what it
+# intended, then decide whether to follow up with
+# browser_type_focused(text=...), which needs no selector. Keeping this
+# as a single shared string avoids drift between the two click paths.
+_FOCUSED_ELEMENT_JS = """
+(function() {
+    var el = document.activeElement;
+    if (!el || el === document.body) return null;
+    var rect = el.getBoundingClientRect();
+    var attrs = {};
+    for (var i = 0; i < el.attributes.length && i < 10; i++) {
+        attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
+    }
+    return {
+        tag: el.tagName.toLowerCase(),
+        id: el.id || null,
+        className: el.className || null,
+        name: el.getAttribute('name') || null,
+        type: el.getAttribute('type') || null,
+        role: el.getAttribute('role') || null,
+        contenteditable: el.getAttribute('contenteditable') || null,
+        text: (el.innerText || '').substring(0, 200),
+        value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
+        attributes: attrs,
+        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
+    };
+})()
+"""
+
+
 def _get_active_profile() -> str:
    """Get the current active profile from context variable."""
    try:
@@ -763,7 +794,8 @@ class BeelineBridge:
                rx = value.get("x", 0) - value.get("width", 0) / 2
                ry = value.get("y", 0) - value.get("height", 0) / 2
                await self.highlight_rect(tab_id, rx, ry, value.get("width", 0), value.get("height", 0), label=selector)
-                return {
+                focused_info = await self._read_focused_element(tab_id)
+                resp = {
                    "ok": True,
                    "action": "click",
                    "selector": selector,
@@ -771,6 +803,9 @@ class BeelineBridge:
                    "y": value.get("y", 0),
                    "method": "javascript",
                }
+                if focused_info:
+                    resp["focused_element"] = focused_info
+                return resp

            # If JavaScript click failed, try CDP approach
            if isinstance(value, dict) and value.get("error"):
@@ -883,7 +918,8 @@ class BeelineBridge:
            w = bounds_value.get("width", 0)
            h = bounds_value.get("height", 0)
            await self.highlight_rect(tab_id, x - w / 2, y - h / 2, w, h, label=selector)
-            return {
+            focused_info = await self._read_focused_element(tab_id)
+            resp = {
                "ok": True,
                "action": "click",
                "selector": selector,
@@ -891,10 +927,29 @@ class BeelineBridge:
                "y": y,
                "method": "cdp",
            }
+            if focused_info:
+                resp["focused_element"] = focused_info
+            return resp

        except Exception as e:
            return {"ok": False, "error": f"Click failed: {e}"}

+    async def _read_focused_element(self, tab_id: int) -> dict | None:
+        """Read document.activeElement and return a compact descriptor.
+
+        Returns None if nothing (or only <body>) is focused, or on any
+        failure — never raises. Used by both click paths (selector-based
+        click() and click_coordinate()) so the agent gets the same response
+        shape regardless of which one was called. The descriptor lets the
+        agent answer "did my click land on an editable?" without a second round-trip.
+        """
+        try:
+            await self._try_enable_domain(tab_id, "Runtime")
+            result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
+            return (result or {}).get("result")
+        except Exception:
+            return None
+
    async def click_coordinate(self, tab_id: int, x: float, y: float, button: str = "left") -> dict:
        """Click at specific coordinates."""
        await self.cdp_attach(tab_id)
@@ -931,40 +986,7 @@ class BeelineBridge:

        await self.highlight_point(tab_id, x, y, label=f"click ({x},{y})")

-        # Query the focused element after the click
-        focused_info = None
-        try:
-            await self._try_enable_domain(tab_id, "Runtime")
-            result = await self.evaluate(
-                tab_id,
-                """
-                (function() {
-                    var el = document.activeElement;
-                    if (!el || el === document.body) return null;
-                    var rect = el.getBoundingClientRect();
-                    var attrs = {};
-                    for (var i = 0; i < el.attributes.length && i < 10; i++) {
-                        attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
-                    }
-                    return {
-                        tag: el.tagName.toLowerCase(),
-                        id: el.id || null,
-                        className: el.className || null,
-                        name: el.getAttribute('name') || null,
-                        type: el.getAttribute('type') || null,
-                        role: el.getAttribute('role') || null,
-                        text: (el.innerText || '').substring(0, 200),
-                        value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
-                        attributes: attrs,
-                        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
-                    };
-                })()
-                """,
-            )
-            focused_info = (result or {}).get("result")
-        except Exception:
-            pass
-
+        focused_info = await self._read_focused_element(tab_id)
        resp = {"ok": True, "action": "click_coordinate", "x": x, "y": y}
        if focused_info:
            resp["focused_element"] = focused_info
@@ -185,37 +185,39 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        use_insert_text: bool = True,
    ) -> dict:
        """
-        Type text into an input element.
+        Type text into an element identified by CSS selector.

-        Automatically routes through a real CDP pointer click on the
-        element before inserting text — so that rich-text editors like
-        Lexical (Gmail, LinkedIn DMs), Draft.js (X compose), and
-        ProseMirror (Reddit) see a native focus event and enable their
-        submit buttons. See the gcu-browser skill for the full "click-
-        then-type" pattern.
+        Performs a CDP click on the selector first (to focus it), then
+        inserts text via CDP Input.insertText. This handles plain
+        <input>/<textarea> as well as rich-text editors (Lexical,
+        Draft.js, ProseMirror) that require a real pointer-sourced
+        focus event.

-        By default uses CDP Input.insertText which is the most reliable
-        way to insert text into rich editors. Set
-        ``use_insert_text=False`` to fall back to per-character
-        keyDown/keyUp events (needed only for code editors that fire
-        on specific keystrokes, or when ``delay_ms`` typing animation
-        is required).
+        For the click-coordinate-then-type pattern (shadow DOM, iframes,
+        or when you don't have a reliable selector), use
+        ``browser_type_focused`` instead — it types into
+        document.activeElement without needing a selector.
+
+        By default uses CDP Input.insertText (``use_insert_text=True``).
+        Set ``use_insert_text=False`` only for code editors that watch
+        specific keystrokes, or when ``delay_ms`` typing animation is
+        required.

        Args:
-            selector: CSS selector for the input element
-            text: Text to type
-            tab_id: Chrome tab ID (default: active tab)
-            profile: Browser profile name (default: "default")
+            selector: CSS selector for the input element.
+            text: Text to type.
+            tab_id: Chrome tab ID (default: active tab).
+            profile: Browser profile name (default: "default").
            delay_ms: Delay between keystrokes in ms (default: 0).
-                      Forces the per-keystroke fallback when > 0.
-            clear_first: Clear existing text before typing (default: True)
-            timeout_ms: Timeout waiting for element (default: 30000)
+                Forces the per-keystroke fallback when > 0.
+            clear_first: Clear existing text before typing (default: True).
+            timeout_ms: Timeout waiting for element (default: 30000).
            use_insert_text: Use CDP Input.insertText (default: True) for
-                             reliable insertion into rich-text editors.
-                             Set False for per-keystroke dispatch.
+                reliable insertion into rich-text editors. Set False for
+                per-keystroke dispatch.

        Returns:
-            Dict with type result
+            Dict with type result.
        """
        start = time.perf_counter()
        params = {"selector": selector, "text": text, "tab_id": tab_id, "profile": profile}
@@ -303,24 +305,35 @@ def register_interaction_tools(mcp: FastMCP) -> None:
        use_insert_text: bool = True,
    ) -> dict:
        """
-        Type text into the already-focused element.
+        Type text into the already-focused element (document.activeElement).

-        Use after browser_click_coordinate (or browser_click) has
-        focused the target element. Inserts text via CDP
-        Input.insertText by default — much faster than repeated
-        browser_press calls for multi-character input.
+        CANONICAL PATTERN:
+            browser_click_coordinate(x, y)   # focus the target
+            browser_type_focused(text="...")  # type into it
+
+        CDP's Input.insertText takes no target parameter — it operates
+        implicitly on the focused editable. This makes it shadow-agnostic
+        and iframe-agnostic, and the ONLY reliable way to type into:
+          - LinkedIn's #interop-outlet Lexical composer
+          - X/Twitter's Draft.js compose box
+          - Reddit's ProseMirror comment box
+          - Any site wrapped in Trusted Types CSP
+          - Any nested-iframe message overlay
+
+        Much faster than repeated browser_press calls for multi-character
+        input.

        Args:
-            text: Text to type
-            tab_id: Chrome tab ID (default: active tab)
-            profile: Browser profile name (default: "default")
+            text: Text to type into the focused element.
+            tab_id: Chrome tab ID (default: active tab).
+            profile: Browser profile name (default: "default").
            delay_ms: Delay between keystrokes in ms (default: 0).
                      Forces per-keystroke dispatch when > 0.
-            clear_first: Clear existing text before typing (default: True)
-            use_insert_text: Use CDP Input.insertText (default: True)
+            clear_first: Clear existing text before typing (default: True).
+            use_insert_text: Use CDP Input.insertText (default: True).

        Returns:
-            Dict with type result
+            Dict with type result.
        """
        start = time.perf_counter()
        params = {"text": text, "tab_id": tab_id, "profile": profile}