feat: browser tools audit and improvements

2026-05-01 13:22:31 -07:00
parent 35bd497750
commit c147364d8c
13 changed files with 165 additions and 199 deletions
@@ -46,9 +46,9 @@ def register_tools(mcp: FastMCP) -> None:
    - Navigation: browser_navigate, browser_go_back, browser_go_forward, browser_reload
    - Inspection: browser_screenshot, browser_snapshot, browser_console
    - Interactions: browser_click, browser_click_coordinate, browser_type, browser_type_focused,
-                    browser_fill, browser_press, browser_hover, browser_select, browser_scroll, browser_drag
+                    browser_press, browser_hover, browser_select, browser_scroll, browser_drag
    - Advanced: browser_wait, browser_evaluate, browser_get_text, browser_get_attribute,
-                  browser_resize, browser_upload, browser_dialog
+                  browser_resize, browser_upload
    """
    register_lifecycle_tools(mcp)
    register_tab_tools(mcp)
@@ -35,16 +35,6 @@ TOOL_SCHEMAS: dict[str, dict] = {
            "use_insert_text": {"type": "boolean", "default": True},
        },
    },
-    "browser_fill": {
-        "description": "Fill an input element (clears existing content first).",
-        "params": {
-            "selector": {"type": "string", "required": True},
-            "value": {"type": "string", "required": True},
-            "tab_id": {"type": "integer"},
-            "profile": {"type": "string"},
-            "timeout_ms": {"type": "integer", "default": 30000},
-        },
-    },
    "browser_type_focused": {
        "description": (
            "Type text into the already-focused element. Use after browser_click_coordinate "
@@ -1,5 +1,5 @@
 """
-Browser advanced tools - wait, evaluate, get_text, get_attribute, resize, dialog.
+Browser advanced tools - wait, evaluate, get_text, get_attribute, resize, upload.

 All operations go through the Beeline extension via CDP - no Playwright required.
 """
@@ -8,7 +8,6 @@ from __future__ import annotations

 import asyncio
 import logging
-from typing import Literal

 from fastmcp import FastMCP

@@ -394,54 +393,3 @@ def register_advanced_tools(mcp: FastMCP) -> None:
            }
        except Exception as e:
            return {"ok": False, "error": str(e)}
-
-    @mcp.tool()
-    async def browser_dialog(
-        action: Literal["accept", "dismiss"] = "accept",
-        prompt_text: str | None = None,
-        tab_id: int | None = None,
-        profile: str | None = None,
-        timeout_ms: int = 30000,
-    ) -> dict:
-        """
-        Handle browser dialogs (alert, confirm, prompt).
-
-        Note: Dialog handling via CDP requires Page.javascriptDialogOpening
-        event handling. This sets up a one-time handler.
-
-        Call BEFORE triggering the action that opens the dialog.
-
-        Args:
-            action: How to handle - "accept" or "dismiss"
-            prompt_text: Text for prompt dialogs (optional)
-            tab_id: Chrome tab ID (default: active tab)
-            profile: Browser profile name (default: "default")
-            timeout_ms: Timeout in ms (default: 30000)
-
-        Returns:
-            Dict with dialog handling result
-        """
-        bridge = get_bridge()
-        if not bridge or not bridge.is_connected:
-            return {"ok": False, "error": "Browser extension not connected"}
-
-        ctx = _get_context(profile)
-        if not ctx:
-            return {"ok": False, "error": "Browser not started"}
-
-        target_tab = tab_id or ctx.get("activeTabId")
-        if target_tab is None:
-            return {"ok": False, "error": "No active tab"}
-
-        try:
-            await bridge.cdp_attach(target_tab)
-            await bridge._cdp(target_tab, "Page.enable")
-
-            return {
-                "ok": True,
-                "action": "handler_set",
-                "message": "Dialog handler prepared.",
-                "suggestion": "Handle dialogs manually or use browser_evaluate.",
-            }
-        except Exception as e:
-            return {"ok": False, "error": str(e)}
@@ -384,48 +384,6 @@ def register_interaction_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_type", params, error=e, duration_ms=(time.perf_counter() - start) * 1000)
            return result

-    @mcp.tool()
-    async def browser_fill(
-        selector: str,
-        value: str,
-        tab_id: int | None = None,
-        profile: str | None = None,
-        timeout_ms: int = 30000,
-        auto_snapshot_mode: AutoSnapshotMode = "simple",
-    ) -> dict:
-        """
-        Fill an input element with a value (clears existing content first).
-
-        Faster than browser_type for filling form fields.
-
-        Args:
-            selector: CSS selector for the input element
-            value: Value to fill
-            tab_id: Chrome tab ID (default: active tab)
-            profile: Browser profile name (default: "default")
-            timeout_ms: Timeout waiting for element (default: 30000)
-            auto_snapshot_mode: Controls the accessibility snapshot taken
-                0.5s after a successful fill. ``"simple"`` (the default)
-                trims unnamed structural nodes; ``"default"`` returns the
-                full tree; ``"interactive"`` returns only controls for the
-                tightest token footprint; ``"off"`` skips the capture —
-                use when batching.
-
-        Returns:
-            Dict with fill result. Includes ``snapshot`` unless
-            ``auto_snapshot_mode="off"`` or the fill failed.
-        """
-        return await browser_type(
-            selector=selector,
-            text=value,
-            tab_id=tab_id,
-            profile=profile,
-            delay_ms=0,
-            clear_first=True,
-            timeout_ms=timeout_ms,
-            auto_snapshot_mode=auto_snapshot_mode,
-        )
-
    @mcp.tool()
    async def browser_type_focused(
        text: str,
@@ -52,6 +52,49 @@ def _clear_profile_tab_caches(ctx: dict[str, Any]) -> None:
    clear_tab_highlights(tab_ids)


+async def _ensure_context(
+    bridge: Any,
+    profile: str | None,
+) -> tuple[str, dict[str, Any], bool]:
+    """Return ``(profile_name, ctx, created)`` for ``profile``.
+
+    Lazy-creates the browser context (tab group + seed tab) the first time
+    a profile is used so URL-taking tools (``browser_open`` /
+    ``browser_navigate``) can be the agent's single cold-start entry
+    point instead of forcing an explicit ``browser_start`` round trip.
+
+    Caller must verify ``bridge`` is connected first; any failure in
+    ``bridge.create_context`` propagates so the caller's existing
+    try/except converts it to an ``{"ok": False, ...}`` result.
+
+    Args:
+        bridge: Object exposing an awaitable ``create_context(profile_name)``
+            whose result supports ``.get("groupId")`` / ``.get("tabId")``.
+        profile: Profile name as supplied by the tool caller; it is passed
+            through ``_resolve_profile`` before being used as the
+            ``_contexts`` key.
+
+    Returns:
+        ``(profile_name, ctx, created)``. ``created`` is ``False`` when
+        ``_contexts`` already held an entry for the resolved profile (that
+        entry is returned untouched) and ``True`` when a new entry was
+        built and stored by this call.
+
+    NOTE(review): the ``await`` on ``create_context`` sits between the
+    ``_contexts`` lookup and the store, so two overlapping first-use calls
+    for the same profile could each create a context and the later store
+    would replace the earlier one. Confirm whether tool calls are
+    serialized upstream.
+    """
+    profile_name = _resolve_profile(profile)
+    existing = _contexts.get(profile_name)
+    if existing is not None:  # fast path: profile already tracked
+        return profile_name, existing, False
+
+    result = await bridge.create_context(profile_name)
+    group_id = result.get("groupId")
+    tab_id = result.get("tabId")
+
+    ctx: dict[str, Any] = {
+        "groupId": group_id,
+        "activeTabId": tab_id,  # the seed tab starts out as the active tab
+        "_seedTabId": tab_id,  # reused by first browser_open call
+        "tabs": {tab_id} if tab_id is not None else set(),  # empty set when no tabId came back
+    }
+    _contexts[profile_name] = ctx  # stored before logging so later lookups hit the fast path
+
+    logger.info(
+        "Started browser context '%s': groupId=%s, tabId=%s",
+        profile_name,
+        group_id,
+        tab_id,
+    )
+    log_context_event("start", profile_name, group_id=group_id, tab_id=tab_id)
+
+    return profile_name, ctx, True
+
+
 async def shutdown_all_contexts() -> None:
    """Close all active browser contexts. Called at GCU server shutdown."""
    if not _contexts:
@@ -198,16 +241,25 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
    @mcp.tool()
    async def browser_start(profile: str | None = None) -> dict:
        """
-        Start a browser context for the given profile.
+        Explicitly create a browser context (tab group) for ``profile``.

-        Creates a tab group in the user's Chrome via the Beeline extension.
-        No separate browser process is launched - uses the user's existing Chrome.
+        Most workflows do NOT need to call this directly: ``browser_open``
+        and ``browser_navigate`` lazy-create a context on first use, so a
+        single ``browser_open(url)`` covers the cold path. Reach for
+        ``browser_start`` when you want to (a) warm a profile without
+        opening a URL yet, or (b) recreate a context after
+        ``browser_stop`` to clear stale state.
+
+        No separate browser process is launched — uses the user's
+        existing Chrome via the Beeline extension.

        Args:
            profile: Browser profile name (default: "default")

        Returns:
-            Dict with start status including groupId and initial tabId
+            Dict with start status (``"started"`` on fresh creation,
+            ``"already_running"`` when a context for the profile exists),
+            including ``groupId`` and ``activeTabId``.
        """
        start = time.perf_counter()
        params = {"profile": profile}
@@ -221,14 +273,11 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_start", params, result=result)
            return result

-        profile_name = _resolve_profile(profile)
-
-        # Check if already running
-        if profile_name in _contexts:
-            ctx = _contexts[profile_name]
+        try:
+            profile_name, ctx, created = await _ensure_context(bridge, profile)
            result = {
                "ok": True,
-                "status": "already_running",
+                "status": "started" if created else "already_running",
                "profile": profile_name,
                "groupId": ctx.get("groupId"),
                "activeTabId": ctx.get("activeTabId"),
@@ -240,42 +289,6 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
                duration_ms=(time.perf_counter() - start) * 1000,
            )
            return result
-
-        try:
-            result = await bridge.create_context(profile_name)
-            group_id = result.get("groupId")
-            tab_id = result.get("tabId")
-
-            _contexts[profile_name] = {
-                "groupId": group_id,
-                "activeTabId": tab_id,
-                "_seedTabId": tab_id,  # reused by first browser_open call
-                "tabs": {tab_id} if tab_id is not None else set(),
-            }
-
-            logger.info(
-                "Started browser context '%s': groupId=%s, tabId=%s",
-                profile_name,
-                group_id,
-                tab_id,
-            )
-
-            log_context_event("start", profile_name, group_id=group_id, tab_id=tab_id)
-
-            result = {
-                "ok": True,
-                "status": "started",
-                "profile": profile_name,
-                "groupId": group_id,
-                "activeTabId": tab_id,
-            }
-            log_tool_call(
-                "browser_start",
-                params,
-                result=result,
-                duration_ms=(time.perf_counter() - start) * 1000,
-            )
-            return result
        except Exception as e:
            logger.exception("Failed to start browser context")
            result = {"ok": False, "error": str(e)}
@@ -14,6 +14,7 @@ from fastmcp import FastMCP

 from ..bridge import get_bridge
 from ..telemetry import log_tool_call
+from .lifecycle import _ensure_context
 from .tabs import _get_context

 logger = logging.getLogger(__name__)
@@ -32,8 +33,14 @@ def register_navigation_tools(mcp: FastMCP) -> None:
        """
        Navigate a tab to a URL.

-        This tool waits for the page to reach the ``wait_until`` condition
-        before returning.
+        Lazy-creates a browser context if none exists (no need to call
+        ``browser_start`` first); when no ``tab_id`` is given and the
+        context was just created, navigation lands on the seed tab.
+        Prefer ``browser_open`` when you specifically want a new tab —
+        ``browser_navigate`` is for redirecting an existing tab.
+
+        Waits for the page to reach the ``wait_until`` condition before
+        returning.

        Args:
            url: URL to navigate to
@@ -54,10 +61,16 @@ def register_navigation_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_navigate", params, result=result)
            return result

-        ctx = _get_context(profile)
-        if not ctx:
-            result = {"ok": False, "error": "Browser not started. Call browser_start first."}
-            log_tool_call("browser_navigate", params, result=result)
+        try:
+            _, ctx, _ = await _ensure_context(bridge, profile)
+        except Exception as e:
+            result = {"ok": False, "error": str(e)}
+            log_tool_call(
+                "browser_navigate",
+                params,
+                error=e,
+                duration_ms=(time.perf_counter() - start) * 1000,
+            )
            return result

        target_tab = tab_id or ctx.get("activeTabId")
@@ -16,7 +16,7 @@ from pydantic import Field
 from ..bridge import get_bridge
 from ..session import _active_profile
 from ..telemetry import log_tool_call
-from .lifecycle import _contexts
+from .lifecycle import _contexts, _ensure_context

 logger = logging.getLogger(__name__)

@@ -98,10 +98,14 @@ def register_tab_tools(mcp: FastMCP) -> None:
        profile: str | None = None,
    ) -> dict:
        """
-        Open a new browser tab and navigate to the given URL.
+        Open a browser tab at the given URL — preferred entry point.

-        The tab is automatically added to the agent's tab group.
-        This tool waits for the page to load before returning.
+        This is the agent's primary "go to a page" tool. If no browser
+        context exists yet for the profile, one is created transparently
+        (no need to call ``browser_start`` first). The first call after
+        a fresh context reuses the seed ``about:blank`` tab; subsequent
+        calls open new tabs in the agent's tab group. Waits for the
+        page to load before returning.

        Args:
            url: URL to navigate to
@@ -120,13 +124,8 @@ def register_tab_tools(mcp: FastMCP) -> None:
            log_tool_call("browser_open", params, result=result)
            return result

-        ctx = _get_context(profile)
-        if not ctx:
-            result = {"ok": False, "error": "Browser not started. Call browser_start first."}
-            log_tool_call("browser_open", params, result=result)
-            return result
-
        try:
+            _, ctx, _ = await _ensure_context(bridge, profile)
            # Reuse the seed about:blank tab from context.create on first open
            seed_tab = ctx.pop("_seedTabId", None)
            if seed_tab is not None: