From c147364d8c76e941c138720abb0467dc468b14d1 Mon Sep 17 00:00:00 2001 From: Richard Tang Date: Fri, 1 May 2026 13:22:31 -0700 Subject: [PATCH] feat: browser tools audit and improvements --- core/framework/agents/queen/nodes/__init__.py | 8 +- .../agents/queen/queen_tools_defaults.py | 85 ++++++++++---- .../agents/queen/reference/gcu_guide.md | 4 +- core/framework/orchestrator/gcu.py | 2 +- .../browser-automation/SKILL.md | 2 +- scripts/browser_remote_ui.html | 4 +- tools/src/gcu/browser/__init__.py | 4 +- tools/src/gcu/browser/bridge_tools.py | 10 -- tools/src/gcu/browser/tools/advanced.py | 54 +-------- tools/src/gcu/browser/tools/interactions.py | 42 ------- tools/src/gcu/browser/tools/lifecycle.py | 105 ++++++++++-------- tools/src/gcu/browser/tools/navigation.py | 25 ++++- tools/src/gcu/browser/tools/tabs.py | 19 ++-- 13 files changed, 165 insertions(+), 199 deletions(-) diff --git a/core/framework/agents/queen/nodes/__init__.py b/core/framework/agents/queen/nodes/__init__.py index 789d7692..aad7ddcf 100644 --- a/core/framework/agents/queen/nodes/__init__.py +++ b/core/framework/agents/queen/nodes/__init__.py @@ -246,9 +246,11 @@ search inside files, target='files' (with a glob like '*.py') to list \ or find files. Mtime-sorted in files mode. ## Browser Automation (gcu-tools MCP) -- Use `browser_*` tools (browser_start, browser_navigate, browser_click, \ - browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \ - browser_tabs, browser_close, browser_evaluate, etc.). +- Use `browser_*` tools — `browser_open(url)` is the cold-start entry point \ + (lazy-creates the context; no `browser_start` first). Then `browser_navigate`, \ + `browser_click`, `browser_type`, `browser_snapshot`, \ + `browser_screenshot`, `browser_scroll`, \ + `browser_tabs`, `browser_close`, `browser_evaluate`, etc. - MUST Follow the browser-automation skill protocol before using browser tools. ## Hand off to a colony diff --git a/core/framework/agents/queen/queen_tools_defaults.py b/core/framework/agents/queen/queen_tools_defaults.py index dbc832b5..21de2af9 100644 --- a/core/framework/agents/queen/queen_tools_defaults.py +++ b/core/framework/agents/queen/queen_tools_defaults.py @@ -36,20 +36,39 @@ logger = logging.getLogger(__name__) # the named entries only). _TOOL_CATEGORIES: dict[str, list[str]] = { - # Unified file ops — read, write, edit, search across the post-refactor - # files-tools MCP server (read_file, write_file, edit_file, hashline_edit, - # apply_patch, search_files). + # Unified file ops — read, write, edit, search across the files-tools + # MCP server (read_file, write_file, edit_file, search_files). pdf_read + # lives in hive_tools so it's listed explicitly; without it queens + # cannot read PDF documents by default. "file_ops": [ "@server:files-tools", + "pdf_read", ], - # Terminal + process control — engineering personas only. - # The terminal-tools MCP server covers foreground exec with auto-promotion, - # background jobs, persistent PTY sessions, and ripgrep/find search. - "terminal": [ - "@server:terminal-tools", + # Terminal basic — the 3-tool subset queens get out of the box. + # terminal_exec — foreground command execution (Bash equivalent) + # terminal_rg — ripgrep content search (Grep equivalent) + # terminal_find — glob/find file listing (Glob equivalent) + "terminal_basic": [ + "terminal_exec", + "terminal_rg", + "terminal_find", + ], + # Terminal advanced — the power-user tools beyond the basics. Not in + # any role default; opt in explicitly per-queen via the Tool Library. + # terminal_job_* — background job lifecycle (start/manage/logs) + # terminal_output_get — fetch captured output from foreground exec + # terminal_pty_* — persistent PTY sessions (open/run/close) + "terminal_advanced": [ + "terminal_job_start", + "terminal_job_manage", + "terminal_job_logs", + "terminal_output_get", + "terminal_pty_open", + "terminal_pty_run", + "terminal_pty_close", ], # Tabular data. CSV/Excel read/write + DuckDB SQL. - "advanced_spreadsheet": [ + "spreadsheet_advanced": [ "csv_read", "csv_info", "csv_write", @@ -75,8 +94,6 @@ _TOOL_CATEGORIES: dict[str, list[str]] = { "browser_open", "browser_close", "browser_activate_tab", - "browser_close_all", - "browser_close_finished", "browser_navigate", "browser_go_back", "browser_go_forward", @@ -98,7 +115,6 @@ _TOOL_CATEGORIES: dict[str, list[str]] = { "browser_click", "browser_click_coordinate", "browser_type", - "browser_fill", "browser_type_focused", "browser_press", "browser_press_at", @@ -110,13 +126,32 @@ _TOOL_CATEGORIES: dict[str, list[str]] = { "browser_wait", "browser_resize", "browser_upload", - "browser_dialog", + ], + # Research — paper search, Wikipedia, ad-hoc web scrape. Pair with + # browser_basic for richer site-by-site research; this category is the + # lightweight always-available fallback. + "research": [ + "search_papers", + "download_paper", + "search_wikipedia", + "web_scrape", + ], + # Security — defensive scanning and reconnaissance. Engineering-only + # surface; the rest of the queens shouldn't see port scanners. + "security": [ + "port_scan", + "dns_security_scan", + "http_headers_scan", + "ssl_tls_scan", + "subdomain_enumerate", + "tech_stack_detect", + "risk_score", ], # Lightweight context helpers — good default for every queen. "time_context": [ "get_current_time", "get_account_info", - ] + ], } @@ -137,24 +172,26 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = { # Head of Technology — builds and operates systems; full toolkit. "queen_technology": [ "file_ops", - "terminal", + "terminal_basic", "browser_basic", "browser_interaction", "research", "security", "time_context", ], - # Head of Growth — data, experiments, competitor research; no terminal/security. + # Head of Growth — data, experiments, competitor research; no security. "queen_growth": [ "file_ops", + "terminal_basic", "browser_basic", "browser_interaction", "research", "time_context", ], - # Head of Product Strategy — user research + roadmaps; no terminal/security. + # Head of Product Strategy — user research + roadmaps; no security. "queen_product_strategy": [ "file_ops", + "terminal_basic", "browser_basic", "browser_interaction", "research", @@ -163,23 +200,26 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = { # Head of Finance — financial models (CSV/Excel heavy), market research. "queen_finance_fundraising": [ "file_ops", - "advanced_spreadsheet", + "terminal_basic", + "spreadsheet_advanced", "browser_basic", "browser_interaction", "research", "time_context", ], - # Head of Legal — reads contracts/PDFs, researches; no terminal/data/security. + # Head of Legal — reads contracts/PDFs, researches; no data/security. "queen_legal": [ "file_ops", + "terminal_basic", "browser_basic", "browser_interaction", "research", "time_context", ], - # Head of Brand & Design — visual refs, style guides; no terminal/data/security. + # Head of Brand & Design — visual refs, style guides; no data/security. "queen_brand_design": [ "file_ops", + "terminal_basic", "browser_basic", "browser_interaction", "research", @@ -188,6 +228,8 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = { # Head of Talent — candidate pipelines, resumes; data + browser heavy. "queen_talent": [ "file_ops", + "terminal_basic", + "spreadsheet_advanced", "browser_basic", "browser_interaction", "research", @@ -196,7 +238,8 @@ QUEEN_DEFAULT_CATEGORIES: dict[str, list[str]] = { # Head of Operations — processes, automation, observability. "queen_operations": [ "file_ops", - "data", + "terminal_basic", + "spreadsheet_advanced", "browser_basic", "browser_interaction", "research", diff --git a/core/framework/agents/queen/reference/gcu_guide.md b/core/framework/agents/queen/reference/gcu_guide.md index aa5a117c..44ac32be 100644 --- a/core/framework/agents/queen/reference/gcu_guide.md +++ b/core/framework/agents/queen/reference/gcu_guide.md @@ -17,8 +17,8 @@ Use browser nodes (with `tools: {policy: "all"}`) when: ## Available Browser Tools All tools are prefixed with `browser_`: -- `browser_start`, `browser_open`, `browser_navigate` — launch/navigate -- `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type`, `browser_type_focused` — interact +- `browser_open`, `browser_navigate` — preferred entry points; both lazy-create a browser context, so a single `browser_open(url)` covers the cold path. Use `browser_start` only to warm a profile without a URL or to recreate a context after `browser_stop`. +- `browser_click`, `browser_click_coordinate`, `browser_type`, `browser_type_focused` — interact - `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts - `browser_snapshot` — compact accessibility-tree read (structured) diff --git a/core/framework/orchestrator/gcu.py b/core/framework/orchestrator/gcu.py index 46b7567d..3d6d3b48 100644 --- a/core/framework/orchestrator/gcu.py +++ b/core/framework/orchestrator/gcu.py @@ -35,7 +35,7 @@ Follow these rules for reliable, efficient browser interaction. Use snapshot first for structure and ordinary controls; switch to screenshot when snapshot can't find or verify the target. Interaction tools (`browser_click`, `browser_type`, `browser_type_focused`, -`browser_fill`, `browser_scroll`) wait 0.5 s for the page to settle +`browser_scroll`) wait 0.5 s for the page to settle after a successful action, then attach a fresh snapshot under the `snapshot` key of their result — so don't call `browser_snapshot` separately after an interaction unless you need a newer view. Tune diff --git a/core/framework/skills/_preset_skills/browser-automation/SKILL.md b/core/framework/skills/_preset_skills/browser-automation/SKILL.md index 86c24a34..7a25e8b0 100644 --- a/core/framework/skills/_preset_skills/browser-automation/SKILL.md +++ b/core/framework/skills/_preset_skills/browser-automation/SKILL.md @@ -113,7 +113,7 @@ Even after `wait_until="load"`, React/Vue SPAs often render their real chrome in ### Reading pages efficiently - **Prefer `browser_snapshot` over `browser_get_text("body")`** — returns a compact ~1–5 KB accessibility tree vs 100+ KB of raw HTML. -- Interaction tools `browser_click`, `browser_type`, `browser_type_focused`, `browser_fill`, and `browser_scroll` wait 0.5 s for the page to settle after a successful action, then attach a fresh accessibility snapshot under the `snapshot` key of their result. Use it to decide your next action — do NOT call `browser_snapshot` separately after every action. Tune the capture via `auto_snapshot_mode`: `"default"` (full tree, the default), `"simple"` (trims unnamed structural nodes), `"interactive"` (only controls — tightest token footprint), or `"off"` to skip the capture entirely (useful when batching several interactions and you don't need the intermediate trees). Call `browser_snapshot` explicitly only when you need a newer view or a different mode than what was auto-captured. +- Interaction tools `browser_click`, `browser_type`, `browser_type_focused`, and `browser_scroll` wait 0.5 s for the page to settle after a successful action, then attach a fresh accessibility snapshot under the `snapshot` key of their result. Use it to decide your next action — do NOT call `browser_snapshot` separately after every action. Tune the capture via `auto_snapshot_mode`: `"default"` (full tree, the default), `"simple"` (trims unnamed structural nodes), `"interactive"` (only controls — tightest token footprint), or `"off"` to skip the capture entirely (useful when batching several interactions and you don't need the intermediate trees). Call `browser_snapshot` explicitly only when you need a newer view or a different mode than what was auto-captured. - Complex pages (LinkedIn, Twitter/X, SPAs with virtual scrolling) can have DOMs that don't match what's visually rendered — snapshot refs may be stale, missing, or misaligned with visible layout. Try the available snapshot first; when the target is not present in that snapshot or visual position matters, switch to `browser_screenshot` to orient yourself. - Only fall back to `browser_get_text` for extracting specific small elements by CSS selector. diff --git a/scripts/browser_remote_ui.html b/scripts/browser_remote_ui.html index b0333976..623d63e6 100644 --- a/scripts/browser_remote_ui.html +++ b/scripts/browser_remote_ui.html @@ -460,9 +460,9 @@ const CATEGORIES = { 'Lifecycle': ['browser_setup', 'browser_start', 'browser_stop', 'browser_status'], 'Tabs': ['browser_tabs', 'browser_open', 'browser_close', 'browser_close_all', 'browser_close_finished', 'browser_activate_tab'], 'Navigation': ['browser_navigate', 'browser_go_back', 'browser_go_forward', 'browser_reload'], - 'Interactions': ['browser_click', 'browser_click_coordinate', 'browser_type', 'browser_type_focused', 'browser_fill', 'browser_press', 'browser_press_at', 'browser_hover', 'browser_hover_coordinate', 'browser_select', 'browser_scroll', 'browser_drag'], + 'Interactions': ['browser_click', 'browser_click_coordinate', 'browser_type', 'browser_type_focused', 'browser_press', 'browser_press_at', 'browser_hover', 'browser_hover_coordinate', 'browser_select', 'browser_scroll', 'browser_drag'], 'Inspection': ['browser_screenshot', 'browser_snapshot', 'browser_console', 'browser_html', 'browser_get_text', 'browser_get_attribute', 'browser_get_rect', 'browser_shadow_query', 'browser_evaluate', 'browser_wait'], - 'Advanced': ['browser_resize', 'browser_upload', 'browser_dialog'], + 'Advanced': ['browser_resize', 'browser_upload'], }; async function init() { diff --git a/tools/src/gcu/browser/__init__.py b/tools/src/gcu/browser/__init__.py index d95ddff8..a6184d43 100644 --- a/tools/src/gcu/browser/__init__.py +++ b/tools/src/gcu/browser/__init__.py @@ -46,9 +46,9 @@ def register_tools(mcp: FastMCP) -> None: - Navigation: browser_navigate, browser_go_back, browser_go_forward, browser_reload - Inspection: browser_screenshot, browser_snapshot, browser_console - Interactions: browser_click, browser_click_coordinate, browser_type, browser_type_focused, - browser_fill, browser_press, browser_hover, browser_select, browser_scroll, browser_drag + browser_press, browser_hover, browser_select, browser_scroll, browser_drag - Advanced: browser_wait, browser_evaluate, browser_get_text, browser_get_attribute, - browser_resize, browser_upload, browser_dialog + browser_resize, browser_upload """ register_lifecycle_tools(mcp) register_tab_tools(mcp) diff --git a/tools/src/gcu/browser/bridge_tools.py b/tools/src/gcu/browser/bridge_tools.py index 0cb4225e..56436e93 100644 --- a/tools/src/gcu/browser/bridge_tools.py +++ b/tools/src/gcu/browser/bridge_tools.py @@ -35,16 +35,6 @@ TOOL_SCHEMAS: dict[str, dict] = { "use_insert_text": {"type": "boolean", "default": True}, }, }, - "browser_fill": { - "description": "Fill an input element (clears existing content first).", - "params": { - "selector": {"type": "string", "required": True}, - "value": {"type": "string", "required": True}, - "tab_id": {"type": "integer"}, - "profile": {"type": "string"}, - "timeout_ms": {"type": "integer", "default": 30000}, - }, - }, "browser_type_focused": { "description": ( "Type text into the already-focused element. Use after browser_click_coordinate " diff --git a/tools/src/gcu/browser/tools/advanced.py b/tools/src/gcu/browser/tools/advanced.py index 90ed3fab..0c8d592f 100644 --- a/tools/src/gcu/browser/tools/advanced.py +++ b/tools/src/gcu/browser/tools/advanced.py @@ -1,5 +1,5 @@ """ -Browser advanced tools - wait, evaluate, get_text, get_attribute, resize, dialog. +Browser advanced tools - wait, evaluate, get_text, get_attribute, resize, upload. All operations go through the Beeline extension via CDP - no Playwright required. """ @@ -8,7 +8,6 @@ from __future__ import annotations import asyncio import logging -from typing import Literal from fastmcp import FastMCP @@ -394,54 +393,3 @@ def register_advanced_tools(mcp: FastMCP) -> None: } except Exception as e: return {"ok": False, "error": str(e)} - - @mcp.tool() - async def browser_dialog( - action: Literal["accept", "dismiss"] = "accept", - prompt_text: str | None = None, - tab_id: int | None = None, - profile: str | None = None, - timeout_ms: int = 30000, - ) -> dict: - """ - Handle browser dialogs (alert, confirm, prompt). - - Note: Dialog handling via CDP requires Page.javascriptDialogOpening - event handling. This sets up a one-time handler. - - Call BEFORE triggering the action that opens the dialog. - - Args: - action: How to handle - "accept" or "dismiss" - prompt_text: Text for prompt dialogs (optional) - tab_id: Chrome tab ID (default: active tab) - profile: Browser profile name (default: "default") - timeout_ms: Timeout in ms (default: 30000) - - Returns: - Dict with dialog handling result - """ - bridge = get_bridge() - if not bridge or not bridge.is_connected: - return {"ok": False, "error": "Browser extension not connected"} - - ctx = _get_context(profile) - if not ctx: - return {"ok": False, "error": "Browser not started"} - - target_tab = tab_id or ctx.get("activeTabId") - if target_tab is None: - return {"ok": False, "error": "No active tab"} - - try: - await bridge.cdp_attach(target_tab) - await bridge._cdp(target_tab, "Page.enable") - - return { - "ok": True, - "action": "handler_set", - "message": "Dialog handler prepared.", - "suggestion": "Handle dialogs manually or use browser_evaluate.", - } - except Exception as e: - return {"ok": False, "error": str(e)} diff --git a/tools/src/gcu/browser/tools/interactions.py b/tools/src/gcu/browser/tools/interactions.py index 649ee315..c34a43de 100644 --- a/tools/src/gcu/browser/tools/interactions.py +++ b/tools/src/gcu/browser/tools/interactions.py @@ -384,48 +384,6 @@ def register_interaction_tools(mcp: FastMCP) -> None: log_tool_call("browser_type", params, error=e, duration_ms=(time.perf_counter() - start) * 1000) return result - @mcp.tool() - async def browser_fill( - selector: str, - value: str, - tab_id: int | None = None, - profile: str | None = None, - timeout_ms: int = 30000, - auto_snapshot_mode: AutoSnapshotMode = "simple", - ) -> dict: - """ - Fill an input element with a value (clears existing content first). - - Faster than browser_type for filling form fields. - - Args: - selector: CSS selector for the input element - value: Value to fill - tab_id: Chrome tab ID (default: active tab) - profile: Browser profile name (default: "default") - timeout_ms: Timeout waiting for element (default: 30000) - auto_snapshot_mode: Controls the accessibility snapshot taken - 0.5s after a successful fill. ``"simple"`` (the default) - trims unnamed structural nodes; ``"default"`` returns the - full tree; ``"interactive"`` returns only controls for the - tightest token footprint; ``"off"`` skips the capture — - use when batching. - - Returns: - Dict with fill result. Includes ``snapshot`` unless - ``auto_snapshot_mode="off"`` or the fill failed. - """ - return await browser_type( - selector=selector, - text=value, - tab_id=tab_id, - profile=profile, - delay_ms=0, - clear_first=True, - timeout_ms=timeout_ms, - auto_snapshot_mode=auto_snapshot_mode, - ) - @mcp.tool() async def browser_type_focused( text: str, diff --git a/tools/src/gcu/browser/tools/lifecycle.py b/tools/src/gcu/browser/tools/lifecycle.py index b0a94dd8..903158ca 100644 --- a/tools/src/gcu/browser/tools/lifecycle.py +++ b/tools/src/gcu/browser/tools/lifecycle.py @@ -52,6 +52,49 @@ def _clear_profile_tab_caches(ctx: dict[str, Any]) -> None: clear_tab_highlights(tab_ids) +async def _ensure_context( + bridge: Any, + profile: str | None, +) -> tuple[str, dict[str, Any], bool]: + """Return ``(profile_name, ctx, created)`` for ``profile``. + + Lazy-creates the browser context (tab group + seed tab) the first time + a profile is used so URL-taking tools (``browser_open`` / + ``browser_navigate``) can be the agent's single cold-start entry + point instead of forcing an explicit ``browser_start`` round trip. + + Caller must verify ``bridge`` is connected first; any failure in + ``bridge.create_context`` propagates so the caller's existing + try/except converts it to an ``{"ok": False, ...}`` result. + """ + profile_name = _resolve_profile(profile) + existing = _contexts.get(profile_name) + if existing is not None: + return profile_name, existing, False + + result = await bridge.create_context(profile_name) + group_id = result.get("groupId") + tab_id = result.get("tabId") + + ctx: dict[str, Any] = { + "groupId": group_id, + "activeTabId": tab_id, + "_seedTabId": tab_id, # reused by first browser_open call + "tabs": {tab_id} if tab_id is not None else set(), + } + _contexts[profile_name] = ctx + + logger.info( + "Started browser context '%s': groupId=%s, tabId=%s", + profile_name, + group_id, + tab_id, + ) + log_context_event("start", profile_name, group_id=group_id, tab_id=tab_id) + + return profile_name, ctx, True + + async def shutdown_all_contexts() -> None: """Close all active browser contexts. Called at GCU server shutdown.""" if not _contexts: @@ -198,16 +241,25 @@ def register_lifecycle_tools(mcp: FastMCP) -> None: @mcp.tool() async def browser_start(profile: str | None = None) -> dict: """ - Start a browser context for the given profile. + Explicitly create a browser context (tab group) for ``profile``. - Creates a tab group in the user's Chrome via the Beeline extension. - No separate browser process is launched - uses the user's existing Chrome. + Most workflows do NOT need to call this directly: ``browser_open`` + and ``browser_navigate`` lazy-create a context on first use, so a + single ``browser_open(url)`` covers the cold path. Reach for + ``browser_start`` when you want to (a) warm a profile without + opening a URL yet, or (b) recreate a context after + ``browser_stop`` to clear stale state. + + No separate browser process is launched — uses the user's + existing Chrome via the Beeline extension. Args: profile: Browser profile name (default: "default") Returns: - Dict with start status including groupId and initial tabId + Dict with start status (``"started"`` on fresh creation, + ``"already_running"`` when a context for the profile exists), + including ``groupId`` and ``activeTabId``. """ start = time.perf_counter() params = {"profile": profile} @@ -221,14 +273,11 @@ def register_lifecycle_tools(mcp: FastMCP) -> None: log_tool_call("browser_start", params, result=result) return result - profile_name = _resolve_profile(profile) - - # Check if already running - if profile_name in _contexts: - ctx = _contexts[profile_name] + try: + profile_name, ctx, created = await _ensure_context(bridge, profile) result = { "ok": True, - "status": "already_running", + "status": "started" if created else "already_running", "profile": profile_name, "groupId": ctx.get("groupId"), "activeTabId": ctx.get("activeTabId"), @@ -240,42 +289,6 @@ def register_lifecycle_tools(mcp: FastMCP) -> None: duration_ms=(time.perf_counter() - start) * 1000, ) return result - - try: - result = await bridge.create_context(profile_name) - group_id = result.get("groupId") - tab_id = result.get("tabId") - - _contexts[profile_name] = { - "groupId": group_id, - "activeTabId": tab_id, - "_seedTabId": tab_id, # reused by first browser_open call - "tabs": {tab_id} if tab_id is not None else set(), - } - - logger.info( - "Started browser context '%s': groupId=%s, tabId=%s", - profile_name, - group_id, - tab_id, - ) - - log_context_event("start", profile_name, group_id=group_id, tab_id=tab_id) - - result = { - "ok": True, - "status": "started", - "profile": profile_name, - "groupId": group_id, - "activeTabId": tab_id, - } - log_tool_call( - "browser_start", - params, - result=result, - duration_ms=(time.perf_counter() - start) * 1000, - ) - return result except Exception as e: logger.exception("Failed to start browser context") result = {"ok": False, "error": str(e)} diff --git a/tools/src/gcu/browser/tools/navigation.py b/tools/src/gcu/browser/tools/navigation.py index a3197de4..cfa8bcb2 100644 --- a/tools/src/gcu/browser/tools/navigation.py +++ b/tools/src/gcu/browser/tools/navigation.py @@ -14,6 +14,7 @@ from fastmcp import FastMCP from ..bridge import get_bridge from ..telemetry import log_tool_call +from .lifecycle import _ensure_context from .tabs import _get_context logger = logging.getLogger(__name__) @@ -32,8 +33,14 @@ def register_navigation_tools(mcp: FastMCP) -> None: """ Navigate a tab to a URL. - This tool waits for the page to reach the ``wait_until`` condition - before returning. + Lazy-creates a browser context if none exists (no need to call + ``browser_start`` first); when no ``tab_id`` is given and the + context was just created, navigation lands on the seed tab. + Prefer ``browser_open`` when you specifically want a new tab — + ``browser_navigate`` is for redirecting an existing tab. + + Waits for the page to reach the ``wait_until`` condition before + returning. Args: url: URL to navigate to @@ -54,10 +61,16 @@ def register_navigation_tools(mcp: FastMCP) -> None: log_tool_call("browser_navigate", params, result=result) return result - ctx = _get_context(profile) - if not ctx: - result = {"ok": False, "error": "Browser not started. Call browser_start first."} - log_tool_call("browser_navigate", params, result=result) + try: + _, ctx, _ = await _ensure_context(bridge, profile) + except Exception as e: + result = {"ok": False, "error": str(e)} + log_tool_call( + "browser_navigate", + params, + error=e, + duration_ms=(time.perf_counter() - start) * 1000, + ) return result target_tab = tab_id or ctx.get("activeTabId") diff --git a/tools/src/gcu/browser/tools/tabs.py b/tools/src/gcu/browser/tools/tabs.py index 90b624b2..7432fa64 100644 --- a/tools/src/gcu/browser/tools/tabs.py +++ b/tools/src/gcu/browser/tools/tabs.py @@ -16,7 +16,7 @@ from pydantic import Field from ..bridge import get_bridge from ..session import _active_profile from ..telemetry import log_tool_call -from .lifecycle import _contexts +from .lifecycle import _contexts, _ensure_context logger = logging.getLogger(__name__) @@ -98,10 +98,14 @@ def register_tab_tools(mcp: FastMCP) -> None: profile: str | None = None, ) -> dict: """ - Open a new browser tab and navigate to the given URL. + Open a browser tab at the given URL — preferred entry point. - The tab is automatically added to the agent's tab group. - This tool waits for the page to load before returning. + This is the agent's primary "go to a page" tool. If no browser + context exists yet for the profile, one is created transparently + (no need to call ``browser_start`` first). The first call after + a fresh context reuses the seed ``about:blank`` tab; subsequent + calls open new tabs in the agent's tab group. Waits for the + page to load before returning. Args: url: URL to navigate to @@ -120,13 +124,8 @@ def register_tab_tools(mcp: FastMCP) -> None: log_tool_call("browser_open", params, result=result) return result - ctx = _get_context(profile) - if not ctx: - result = {"ok": False, "error": "Browser not started. Call browser_start first."} - log_tool_call("browser_open", params, result=result) - return result - try: + _, ctx, _ = await _ensure_context(bridge, profile) # Reuse the seed about:blank tab from context.create on first open seed_tab = ctx.pop("_seedTabId", None) if seed_tab is not None: