feat: browser tools audit and improvements

This commit is contained in:
Richard Tang
2026-05-01 13:22:31 -07:00
parent 35bd497750
commit c147364d8c
13 changed files with 165 additions and 199 deletions
+2 -2
View File
@@ -46,9 +46,9 @@ def register_tools(mcp: FastMCP) -> None:
- Navigation: browser_navigate, browser_go_back, browser_go_forward, browser_reload
- Inspection: browser_screenshot, browser_snapshot, browser_console
- Interactions: browser_click, browser_click_coordinate, browser_type, browser_type_focused,
browser_fill, browser_press, browser_hover, browser_select, browser_scroll, browser_drag
browser_press, browser_hover, browser_select, browser_scroll, browser_drag
- Advanced: browser_wait, browser_evaluate, browser_get_text, browser_get_attribute,
browser_resize, browser_upload, browser_dialog
browser_resize, browser_upload
"""
register_lifecycle_tools(mcp)
register_tab_tools(mcp)
-10
View File
@@ -35,16 +35,6 @@ TOOL_SCHEMAS: dict[str, dict] = {
"use_insert_text": {"type": "boolean", "default": True},
},
},
"browser_fill": {
"description": "Fill an input element (clears existing content first).",
"params": {
"selector": {"type": "string", "required": True},
"value": {"type": "string", "required": True},
"tab_id": {"type": "integer"},
"profile": {"type": "string"},
"timeout_ms": {"type": "integer", "default": 30000},
},
},
"browser_type_focused": {
"description": (
"Type text into the already-focused element. Use after browser_click_coordinate "
+1 -53
View File
@@ -1,5 +1,5 @@
"""
Browser advanced tools - wait, evaluate, get_text, get_attribute, resize, dialog.
Browser advanced tools - wait, evaluate, get_text, get_attribute, resize, upload.
All operations go through the Beeline extension via CDP - no Playwright required.
"""
@@ -8,7 +8,6 @@ from __future__ import annotations
import asyncio
import logging
from typing import Literal
from fastmcp import FastMCP
@@ -394,54 +393,3 @@ def register_advanced_tools(mcp: FastMCP) -> None:
}
except Exception as e:
return {"ok": False, "error": str(e)}
@mcp.tool()
async def browser_dialog(
    action: Literal["accept", "dismiss"] = "accept",
    prompt_text: str | None = None,
    tab_id: int | None = None,
    profile: str | None = None,
    timeout_ms: int = 30000,
) -> dict:
    """
    Prepare a tab for JavaScript dialog handling (alert, confirm, prompt).

    What this body actually does: attaches CDP to the target tab and
    sends ``Page.enable``. It does not register a listener for
    ``Page.javascriptDialogOpening`` and does not accept or dismiss
    anything, so ``action``, ``prompt_text`` and ``timeout_ms`` are
    accepted but never read.

    Args:
        action: Intended handling - "accept" or "dismiss" (unused here)
        prompt_text: Text for prompt dialogs (unused here)
        tab_id: Chrome tab ID (default: the context's active tab)
        profile: Browser profile name, resolved by ``_get_context``
        timeout_ms: Timeout in ms (unused here)

    Returns:
        ``{"ok": True, "action": "handler_set", ...}`` once the CDP
        calls succeed, otherwise ``{"ok": False, "error": ...}``.
    """
    bridge = get_bridge()
    if not (bridge and bridge.is_connected):
        return {"ok": False, "error": "Browser extension not connected"}

    context = _get_context(profile)
    if not context:
        return {"ok": False, "error": "Browser not started"}

    # A falsy tab_id (None or 0) falls back to the context's active tab.
    tab = tab_id if tab_id else context.get("activeTabId")
    if tab is None:
        return {"ok": False, "error": "No active tab"}

    try:
        await bridge.cdp_attach(tab)
        await bridge._cdp(tab, "Page.enable")
    except Exception as exc:
        # Tool boundary: report the failure in-band instead of raising.
        return {"ok": False, "error": str(exc)}

    return {
        "ok": True,
        "action": "handler_set",
        "message": "Dialog handler prepared.",
        "suggestion": "Handle dialogs manually or use browser_evaluate.",
    }
@@ -384,48 +384,6 @@ def register_interaction_tools(mcp: FastMCP) -> None:
log_tool_call("browser_type", params, error=e, duration_ms=(time.perf_counter() - start) * 1000)
return result
@mcp.tool()
async def browser_fill(
    selector: str,
    value: str,
    tab_id: int | None = None,
    profile: str | None = None,
    timeout_ms: int = 30000,
    auto_snapshot_mode: AutoSnapshotMode = "simple",
) -> dict:
    """
    Replace the content of an input element with ``value``.

    Thin wrapper over ``browser_type``: forwards every argument and
    pins ``clear_first=True`` and ``delay_ms=0`` so existing content is
    cleared and no per-keystroke delay is requested.

    Args:
        selector: CSS selector for the input element
        value: Value to fill (passed to ``browser_type`` as ``text``)
        tab_id: Chrome tab ID (default: active tab)
        profile: Browser profile name (default: "default")
        timeout_ms: Timeout waiting for element (default: 30000)
        auto_snapshot_mode: Forwarded unchanged to ``browser_type``;
            selects the accessibility snapshot variant (``"simple"``,
            ``"default"``, ``"interactive"``) or ``"off"`` to skip the
            capture, e.g. when batching.

    Returns:
        Whatever ``browser_type`` returns for the forwarded call.
    """
    # Only the last two entries are fixed by this wrapper; the rest is
    # a straight pass-through of the caller's arguments.
    forwarded = {
        "selector": selector,
        "text": value,
        "tab_id": tab_id,
        "profile": profile,
        "timeout_ms": timeout_ms,
        "auto_snapshot_mode": auto_snapshot_mode,
        "delay_ms": 0,
        "clear_first": True,
    }
    return await browser_type(**forwarded)
@mcp.tool()
async def browser_type_focused(
text: str,
+59 -46
View File
@@ -52,6 +52,49 @@ def _clear_profile_tab_caches(ctx: dict[str, Any]) -> None:
clear_tab_highlights(tab_ids)
async def _ensure_context(
    bridge: Any,
    profile: str | None,
) -> tuple[str, dict[str, Any], bool]:
    """Look up, or lazily create, the browser context for ``profile``.

    Returns ``(profile_name, ctx, created)``; ``created`` is ``True``
    only when this call had to build the context. On a miss the bridge
    is asked to create a context, the resulting group/tab ids are
    stored in ``_contexts`` under the resolved profile name, and a
    "start" context event is logged.

    The bridge's connection state is not checked here, and any
    exception raised by ``bridge.create_context`` propagates to the
    caller unchanged.

    NOTE(review): the cache lookup and the store are separated by an
    ``await``, so two concurrent first calls for one profile could both
    create a context - confirm whether callers serialize access.
    """
    name = _resolve_profile(profile)

    cached = _contexts.get(name)
    if cached is not None:
        return name, cached, False

    created = await bridge.create_context(name)
    group_id, seed_tab = created.get("groupId"), created.get("tabId")

    fresh: dict[str, Any] = {
        "groupId": group_id,
        "activeTabId": seed_tab,
        "_seedTabId": seed_tab,  # reused by first browser_open call
        "tabs": set() if seed_tab is None else {seed_tab},
    }
    _contexts[name] = fresh

    logger.info(
        "Started browser context '%s': groupId=%s, tabId=%s",
        name,
        group_id,
        seed_tab,
    )
    log_context_event("start", name, group_id=group_id, tab_id=seed_tab)
    return name, fresh, True
async def shutdown_all_contexts() -> None:
"""Close all active browser contexts. Called at GCU server shutdown."""
if not _contexts:
@@ -198,16 +241,25 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
@mcp.tool()
async def browser_start(profile: str | None = None) -> dict:
"""
Start a browser context for the given profile.
Explicitly create a browser context (tab group) for ``profile``.
Creates a tab group in the user's Chrome via the Beeline extension.
No separate browser process is launched - uses the user's existing Chrome.
Most workflows do NOT need to call this directly: ``browser_open``
and ``browser_navigate`` lazy-create a context on first use, so a
single ``browser_open(url)`` covers the cold path. Reach for
``browser_start`` when you want to (a) warm a profile without
opening a URL yet, or (b) recreate a context after
``browser_stop`` to clear stale state.
No separate browser process is launched — uses the user's
existing Chrome via the Beeline extension.
Args:
profile: Browser profile name (default: "default")
Returns:
Dict with start status including groupId and initial tabId
Dict with start status (``"started"`` on fresh creation,
``"already_running"`` when a context for the profile exists),
including ``groupId`` and ``activeTabId``.
"""
start = time.perf_counter()
params = {"profile": profile}
@@ -221,14 +273,11 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
log_tool_call("browser_start", params, result=result)
return result
profile_name = _resolve_profile(profile)
# Check if already running
if profile_name in _contexts:
ctx = _contexts[profile_name]
try:
profile_name, ctx, created = await _ensure_context(bridge, profile)
result = {
"ok": True,
"status": "already_running",
"status": "started" if created else "already_running",
"profile": profile_name,
"groupId": ctx.get("groupId"),
"activeTabId": ctx.get("activeTabId"),
@@ -240,42 +289,6 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
try:
result = await bridge.create_context(profile_name)
group_id = result.get("groupId")
tab_id = result.get("tabId")
_contexts[profile_name] = {
"groupId": group_id,
"activeTabId": tab_id,
"_seedTabId": tab_id, # reused by first browser_open call
"tabs": {tab_id} if tab_id is not None else set(),
}
logger.info(
"Started browser context '%s': groupId=%s, tabId=%s",
profile_name,
group_id,
tab_id,
)
log_context_event("start", profile_name, group_id=group_id, tab_id=tab_id)
result = {
"ok": True,
"status": "started",
"profile": profile_name,
"groupId": group_id,
"activeTabId": tab_id,
}
log_tool_call(
"browser_start",
params,
result=result,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
except Exception as e:
logger.exception("Failed to start browser context")
result = {"ok": False, "error": str(e)}
+19 -6
View File
@@ -14,6 +14,7 @@ from fastmcp import FastMCP
from ..bridge import get_bridge
from ..telemetry import log_tool_call
from .lifecycle import _ensure_context
from .tabs import _get_context
logger = logging.getLogger(__name__)
@@ -32,8 +33,14 @@ def register_navigation_tools(mcp: FastMCP) -> None:
"""
Navigate a tab to a URL.
This tool waits for the page to reach the ``wait_until`` condition
before returning.
Lazy-creates a browser context if none exists (no need to call
``browser_start`` first); when no ``tab_id`` is given and the
context was just created, navigation lands on the seed tab.
Prefer ``browser_open`` when you specifically want a new tab —
``browser_navigate`` is for redirecting an existing tab.
Waits for the page to reach the ``wait_until`` condition before
returning.
Args:
url: URL to navigate to
@@ -54,10 +61,16 @@ def register_navigation_tools(mcp: FastMCP) -> None:
log_tool_call("browser_navigate", params, result=result)
return result
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
log_tool_call("browser_navigate", params, result=result)
try:
_, ctx, _ = await _ensure_context(bridge, profile)
except Exception as e:
result = {"ok": False, "error": str(e)}
log_tool_call(
"browser_navigate",
params,
error=e,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
target_tab = tab_id or ctx.get("activeTabId")
+9 -10
View File
@@ -16,7 +16,7 @@ from pydantic import Field
from ..bridge import get_bridge
from ..session import _active_profile
from ..telemetry import log_tool_call
from .lifecycle import _contexts
from .lifecycle import _contexts, _ensure_context
logger = logging.getLogger(__name__)
@@ -98,10 +98,14 @@ def register_tab_tools(mcp: FastMCP) -> None:
profile: str | None = None,
) -> dict:
"""
Open a new browser tab and navigate to the given URL.
Open a browser tab at the given URL — preferred entry point.
The tab is automatically added to the agent's tab group.
This tool waits for the page to load before returning.
This is the agent's primary "go to a page" tool. If no browser
context exists yet for the profile, one is created transparently
(no need to call ``browser_start`` first). The first call after
a fresh context reuses the seed ``about:blank`` tab; subsequent
calls open new tabs in the agent's tab group. Waits for the
page to load before returning.
Args:
url: URL to navigate to
@@ -120,13 +124,8 @@ def register_tab_tools(mcp: FastMCP) -> None:
log_tool_call("browser_open", params, result=result)
return result
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
log_tool_call("browser_open", params, result=result)
return result
try:
_, ctx, _ = await _ensure_context(bridge, profile)
# Reuse the seed about:blank tab from context.create on first open
seed_tab = ctx.pop("_seedTabId", None)
if seed_tab is not None: