feat: improve web search and consolidate browser open
This commit is contained in:
@@ -61,7 +61,7 @@ All replies carry `{ id, result }` or `{ id, error }`.
|
||||
# 1. At GCU server startup, open ws://localhost:9229/beeline and wait for
|
||||
# the extension to connect (sends { type: "hello" }).
|
||||
#
|
||||
# 2. On browser_start(profile):
|
||||
# 2. On the first browser tool call for a profile (lazy-start via _ensure_context):
|
||||
# - Send { id, type: "context.create", agentId: profile }
|
||||
# - Receive { groupId, tabId }
|
||||
# - Store groupId in the session object (no Chrome process, no CDP port)
|
||||
|
||||
@@ -10,12 +10,14 @@ Validates URLs against internal network ranges to prevent SSRF attacks.
|
||||
from __future__ import annotations
|
||||
|
||||
import ipaddress
|
||||
import json
|
||||
import re
|
||||
import socket
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from fastmcp import FastMCP
|
||||
from playwright.async_api import (
|
||||
Error as PlaywrightError,
|
||||
@@ -82,6 +84,7 @@ def register_tools(mcp: FastMCP) -> None:
|
||||
selector: str | None = None,
|
||||
include_links: bool = False,
|
||||
max_length: int = 50000,
|
||||
offset: int = 0,
|
||||
respect_robots_txt: bool = True,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -94,12 +97,18 @@ def register_tools(mcp: FastMCP) -> None:
|
||||
Args:
|
||||
url: URL of the webpage to scrape
|
||||
selector: CSS selector to target specific content (e.g., 'article', '.main-content')
|
||||
include_links: Include extracted links in the response
|
||||
max_length: Maximum length of extracted text (1000-500000)
|
||||
include_links: When True, links are inlined as `[text](url)` in
|
||||
content and also returned as a `links` list
|
||||
max_length: Maximum length of extracted text returned in this call (1000-500000)
|
||||
offset: Character offset into the extracted text. Use with
|
||||
`next_offset` from a prior truncated result to paginate.
|
||||
respect_robots_txt: Whether to respect robots.txt rules (default True)
|
||||
|
||||
Returns:
|
||||
Dict with scraped content (url, title, description, content, length) or error dict
|
||||
Dict with: url, final_url, title, description, page_type
|
||||
(article|listing|page), content, length, offset, total_length,
|
||||
truncated, next_offset, headings, structured_data (json_ld + open_graph),
|
||||
and optionally links. On error, returns {"error": str, ...} with a hint when applicable.
|
||||
"""
|
||||
try:
|
||||
# Validate URL
|
||||
@@ -128,6 +137,10 @@ def register_tools(mcp: FastMCP) -> None:
|
||||
"error": f"Blocked by robots.txt: {url}",
|
||||
"url": url,
|
||||
"skipped": True,
|
||||
"hint": (
|
||||
"Pass respect_robots_txt=False if you have "
|
||||
"authorization to scrape this site."
|
||||
),
|
||||
}
|
||||
except Exception:
|
||||
pass # If robots.txt can't be fetched, proceed anyway
|
||||
@@ -195,7 +208,17 @@ def register_tools(mcp: FastMCP) -> None:
|
||||
return {"error": "Navigation failed: no response received"}
|
||||
|
||||
if response.status != 200:
|
||||
return {"error": f"HTTP {response.status}: Failed to fetch URL"}
|
||||
hint = (
|
||||
"Site likely requires auth, blocks bots, or is rate-limiting."
|
||||
if response.status in (401, 403, 429)
|
||||
else "Resource may not exist or server may be down."
|
||||
)
|
||||
return {
|
||||
"error": f"HTTP {response.status}: Failed to fetch URL",
|
||||
"url": url,
|
||||
"status": response.status,
|
||||
"hint": hint,
|
||||
}
|
||||
|
||||
content_type = response.headers.get("content-type", "").lower()
|
||||
if not any(t in content_type for t in ["text/html", "application/xhtml+xml"]):
|
||||
@@ -218,63 +241,165 @@ def register_tools(mcp: FastMCP) -> None:
|
||||
|
||||
# Parse rendered HTML with BeautifulSoup
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
base_url = str(response.url) # Final URL after redirects
|
||||
|
||||
# Extract structured data BEFORE noise removal — JSON-LD lives
|
||||
# in <script>, which gets decomposed below. JSON-LD is often the
|
||||
# cleanest source of structured info on listing pages.
|
||||
json_ld: list[Any] = []
|
||||
for script in soup.find_all("script", type="application/ld+json"):
|
||||
raw = script.string or script.get_text() or ""
|
||||
if raw.strip():
|
||||
try:
|
||||
json_ld.append(json.loads(raw))
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
|
||||
open_graph: dict[str, str] = {}
|
||||
for meta in soup.find_all("meta"):
|
||||
prop = (meta.get("property") or "").strip()
|
||||
if prop.startswith("og:"):
|
||||
val = (meta.get("content") or "").strip()
|
||||
if val:
|
||||
open_graph[prop[3:]] = val
|
||||
|
||||
# Remove noise elements
|
||||
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]):
|
||||
tag.decompose()
|
||||
|
||||
# Get title and description
|
||||
# Get title and description (fall back to OG description)
|
||||
title = soup.title.get_text(strip=True) if soup.title else ""
|
||||
|
||||
description = ""
|
||||
meta_desc = soup.find("meta", attrs={"name": "description"})
|
||||
if meta_desc:
|
||||
description = meta_desc.get("content", "")
|
||||
description = meta_desc.get("content", "") or ""
|
||||
if not description:
|
||||
description = open_graph.get("description", "")
|
||||
|
||||
# Target content
|
||||
# Headings outline (capped) — lets the agent drill in via selector
|
||||
headings: list[dict[str, Any]] = []
|
||||
for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
|
||||
h_text = h.get_text(strip=True)
|
||||
if h_text:
|
||||
headings.append({"level": int(h.name[1]), "text": h_text})
|
||||
if len(headings) >= 100:
|
||||
break
|
||||
|
||||
# Page-type heuristic: many <article> blocks → listing page
|
||||
article_count = len(soup.find_all("article"))
|
||||
if article_count >= 3:
|
||||
page_type = "listing"
|
||||
elif article_count == 1 or soup.find("main"):
|
||||
page_type = "article"
|
||||
else:
|
||||
page_type = "page"
|
||||
|
||||
# Locate target subtree
|
||||
if selector:
|
||||
content_elem = soup.select_one(selector)
|
||||
if not content_elem:
|
||||
return {"error": f"No elements found matching selector: {selector}"}
|
||||
text = content_elem.get_text(separator=" ", strip=True)
|
||||
return {
|
||||
"error": f"No elements found matching selector: {selector}",
|
||||
"url": url,
|
||||
"hint": "Try a broader selector or omit selector to use auto-detection.",
|
||||
}
|
||||
else:
|
||||
# Auto-detect main content
|
||||
main_content = (
|
||||
soup.find("article")
|
||||
or soup.find("main")
|
||||
# Prefer <main> over the first <article> — on listing pages
|
||||
# the latter would drop every article after the first.
|
||||
content_elem = (
|
||||
soup.find("main")
|
||||
or soup.find(attrs={"role": "main"})
|
||||
or soup.find("article")
|
||||
or soup.find(class_=["content", "post", "entry", "article-body"])
|
||||
or soup.find("body")
|
||||
)
|
||||
text = main_content.get_text(separator=" ", strip=True) if main_content else ""
|
||||
|
||||
# Clean up whitespace
|
||||
text = " ".join(text.split())
|
||||
# Collect link metadata BEFORE rewriting anchors (rewriting
|
||||
# replaces <a> elements with NavigableStrings, so find_all('a')
|
||||
# would miss them after).
|
||||
links: list[dict[str, str]] = []
|
||||
if content_elem and include_links:
|
||||
for a in content_elem.find_all("a", href=True)[:50]:
|
||||
link_text = a.get_text(strip=True)
|
||||
href = urljoin(base_url, a["href"])
|
||||
if link_text and href:
|
||||
links.append({"text": link_text, "href": href})
|
||||
|
||||
# Truncate if needed (reserve 3 chars for the ellipsis so the
|
||||
# final string stays within max_length)
|
||||
if len(text) > max_length:
|
||||
text = text[: max_length - 3] + "..."
|
||||
text = ""
|
||||
if content_elem:
|
||||
# Inline anchors as [text](url) so links survive text
|
||||
# extraction (otherwise the agent has to correlate `links`
|
||||
# against the text blob).
|
||||
if include_links:
|
||||
for a in content_elem.find_all("a", href=True):
|
||||
link_text = a.get_text(strip=True)
|
||||
if link_text:
|
||||
href = urljoin(base_url, a["href"])
|
||||
a.replace_with(NavigableString(f"[{link_text}]({href})"))
|
||||
|
||||
# Convert <br> and block elements into newlines so the output
|
||||
# preserves paragraph/list/heading structure rather than
|
||||
# collapsing into one giant whitespace-joined string.
|
||||
for br in content_elem.find_all("br"):
|
||||
br.replace_with(NavigableString("\n"))
|
||||
block_tags = (
|
||||
"p", "h1", "h2", "h3", "h4", "h5", "h6",
|
||||
"li", "tr", "div", "section", "article", "blockquote",
|
||||
)
|
||||
for block in content_elem.find_all(block_tags):
|
||||
block.insert_before(NavigableString("\n"))
|
||||
block.append(NavigableString("\n"))
|
||||
|
||||
raw_text = content_elem.get_text(separator=" ")
|
||||
|
||||
# Normalize: squash spaces within each line, collapse runs of
|
||||
# blank lines to a single blank, trim.
|
||||
cleaned: list[str] = []
|
||||
blank = True # swallow leading blanks
|
||||
for line in raw_text.split("\n"):
|
||||
line = re.sub(r"[ \t]+", " ", line).strip()
|
||||
if line:
|
||||
cleaned.append(line)
|
||||
blank = False
|
||||
elif not blank:
|
||||
cleaned.append("")
|
||||
blank = True
|
||||
text = "\n".join(cleaned).strip()
|
||||
|
||||
# Apply offset/truncation with continuation metadata. Reserve 3
|
||||
# chars for the ellipsis so the returned string stays within
|
||||
# max_length (back-compat with existing test expectations).
|
||||
total_length = len(text)
|
||||
offset = max(0, min(offset, total_length))
|
||||
end = offset + max_length
|
||||
truncated = end < total_length
|
||||
sliced = text[offset:end]
|
||||
if truncated and len(sliced) >= 3:
|
||||
sliced = sliced[: -3] + "..."
|
||||
|
||||
structured_data: dict[str, Any] = {}
|
||||
if json_ld:
|
||||
structured_data["json_ld"] = json_ld
|
||||
if open_graph:
|
||||
structured_data["open_graph"] = open_graph
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"url": url,
|
||||
"final_url": base_url,
|
||||
"title": title,
|
||||
"description": description,
|
||||
"content": text,
|
||||
"length": len(text),
|
||||
"page_type": page_type,
|
||||
"content": sliced,
|
||||
"length": len(sliced),
|
||||
"offset": offset,
|
||||
"total_length": total_length,
|
||||
"truncated": truncated,
|
||||
"next_offset": end if truncated else None,
|
||||
"headings": headings,
|
||||
}
|
||||
|
||||
# Extract links if requested
|
||||
if structured_data:
|
||||
result["structured_data"] = structured_data
|
||||
if include_links:
|
||||
links: list[dict[str, str]] = []
|
||||
base_url = str(response.url) # Use final URL after redirects
|
||||
for a in soup.find_all("a", href=True)[:50]:
|
||||
href = a["href"]
|
||||
# Convert relative URLs to absolute URLs
|
||||
absolute_href = urljoin(base_url, href)
|
||||
link_text = a.get_text(strip=True)
|
||||
if link_text and absolute_href:
|
||||
links.append({"text": link_text, "href": absolute_href})
|
||||
result["links"] = links
|
||||
|
||||
return result
|
||||
|
||||
@@ -41,7 +41,7 @@ def register_tools(mcp: FastMCP) -> None:
|
||||
"""Register all GCU browser tools with the MCP server.
|
||||
|
||||
Tools are organized into categories:
|
||||
- Lifecycle: browser_start, browser_stop, browser_status
|
||||
- Lifecycle: browser_setup, browser_status, browser_stop (browser_open lazy-creates the context)
|
||||
- Tabs: browser_tabs, browser_open, browser_close, browser_activate_tab
|
||||
- Navigation: browser_navigate, browser_go_back, browser_go_forward, browser_reload
|
||||
- Inspection: browser_screenshot, browser_snapshot, browser_console
|
||||
|
||||
@@ -642,7 +642,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_snapshot", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -727,7 +727,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_html", params, result=result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -153,7 +153,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_click", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -247,7 +247,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_click_coordinate", params, result=result)
|
||||
return _text_only(result)
|
||||
|
||||
@@ -352,7 +352,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_type", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -432,7 +432,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_type_focused", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -506,7 +506,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_press", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -560,7 +560,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_hover", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -627,7 +627,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_hover_coordinate", params, result=result)
|
||||
return _text_only(result)
|
||||
|
||||
@@ -712,7 +712,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_press_at", params, result=result)
|
||||
return _text_only(result)
|
||||
|
||||
@@ -782,7 +782,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_select", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -860,7 +860,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_scroll", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -924,7 +924,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_drag", params, result=result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ async def _ensure_context(
|
||||
Lazy-creates the browser context (tab group + seed tab) the first time
|
||||
a profile is used so URL-taking tools (``browser_open`` /
|
||||
``browser_navigate``) can be the agent's single cold-start entry
|
||||
point instead of forcing an explicit ``browser_start`` round trip.
|
||||
point — no separate "start" tool to remember.
|
||||
|
||||
Caller must verify ``bridge`` is connected first; any failure in
|
||||
``bridge.create_context`` propagates so the caller's existing
|
||||
@@ -137,7 +137,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
|
||||
return {
|
||||
"ok": True,
|
||||
"connected": True,
|
||||
"status": "Extension is connected and ready. Call browser_start to begin.",
|
||||
"status": "Extension is connected and ready. Call browser_open(url) to begin.",
|
||||
}
|
||||
|
||||
return {
|
||||
@@ -150,7 +150,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
|
||||
"step_3": "Click 'Load unpacked'",
|
||||
"step_4": f"Select this directory: {ext_path}",
|
||||
"step_5": ("Click the extension icon in the Chrome toolbar to confirm it says 'Connected'"),
|
||||
"step_6": "Return here and call browser_start",
|
||||
"step_6": "Return here and call browser_open(url) to begin",
|
||||
},
|
||||
"extensionPath": ext_path,
|
||||
"extensionPathExists": ext_exists,
|
||||
@@ -238,63 +238,6 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
|
||||
)
|
||||
return result
|
||||
|
||||
@mcp.tool()
|
||||
async def browser_start(profile: str | None = None) -> dict:
|
||||
"""
|
||||
Explicitly create a browser context (tab group) for ``profile``.
|
||||
|
||||
Most workflows do NOT need to call this directly: ``browser_open``
|
||||
and ``browser_navigate`` lazy-create a context on first use, so a
|
||||
single ``browser_open(url)`` covers the cold path. Reach for
|
||||
``browser_start`` when you want to (a) warm a profile without
|
||||
opening a URL yet, or (b) recreate a context after
|
||||
``browser_stop`` to clear stale state.
|
||||
|
||||
No separate browser process is launched — uses the user's
|
||||
existing Chrome via the Beeline extension.
|
||||
|
||||
Args:
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
Returns:
|
||||
Dict with start status (``"started"`` on fresh creation,
|
||||
``"already_running"`` when a context for the profile exists),
|
||||
including ``groupId`` and ``activeTabId``.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"profile": profile}
|
||||
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
result = {
|
||||
"ok": False,
|
||||
"error": ("Browser extension not connected. Call browser_setup for installation instructions."),
|
||||
}
|
||||
log_tool_call("browser_start", params, result=result)
|
||||
return result
|
||||
|
||||
try:
|
||||
profile_name, ctx, created = await _ensure_context(bridge, profile)
|
||||
result = {
|
||||
"ok": True,
|
||||
"status": "started" if created else "already_running",
|
||||
"profile": profile_name,
|
||||
"groupId": ctx.get("groupId"),
|
||||
"activeTabId": ctx.get("activeTabId"),
|
||||
}
|
||||
log_tool_call(
|
||||
"browser_start",
|
||||
params,
|
||||
result=result,
|
||||
duration_ms=(time.perf_counter() - start) * 1000,
|
||||
)
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.exception("Failed to start browser context")
|
||||
result = {"ok": False, "error": str(e)}
|
||||
log_tool_call("browser_start", params, error=e, duration_ms=(time.perf_counter() - start) * 1000)
|
||||
return result
|
||||
|
||||
@mcp.tool()
|
||||
async def browser_stop(profile: str | None = None) -> dict:
|
||||
"""
|
||||
|
||||
@@ -33,11 +33,10 @@ def register_navigation_tools(mcp: FastMCP) -> None:
|
||||
"""
|
||||
Navigate a tab to a URL.
|
||||
|
||||
Lazy-creates a browser context if none exists (no need to call
|
||||
``browser_start`` first); when no ``tab_id`` is given and the
|
||||
context was just created, navigation lands on the seed tab.
|
||||
Prefer ``browser_open`` when you specifically want a new tab —
|
||||
``browser_navigate`` is for redirecting an existing tab.
|
||||
Lazy-creates a browser context if none exists; when no ``tab_id``
|
||||
is given and the context was just created, navigation lands on
|
||||
the seed tab. Prefer ``browser_open`` when you specifically want
|
||||
a new tab — ``browser_navigate`` is for redirecting an existing tab.
|
||||
|
||||
Waits for the page to reach the ``wait_until`` condition before
|
||||
returning.
|
||||
@@ -130,7 +129,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_go_back", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -180,7 +179,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_go_forward", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -235,7 +234,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_reload", params, result=result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_tabs", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -100,12 +100,12 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
"""
|
||||
Open a browser tab at the given URL — preferred entry point.
|
||||
|
||||
This is the agent's primary "go to a page" tool. If no browser
|
||||
context exists yet for the profile, one is created transparently
|
||||
(no need to call ``browser_start`` first). The first call after
|
||||
a fresh context reuses the seed ``about:blank`` tab; subsequent
|
||||
calls open new tabs in the agent's tab group. Waits for the
|
||||
page to load before returning.
|
||||
This is the agent's primary "go to a page" tool and the cold-start
|
||||
entry point — if no browser context exists yet for the profile,
|
||||
one is created transparently. The first call after a fresh
|
||||
context reuses the seed ``about:blank`` tab; subsequent calls
|
||||
open new tabs in the agent's tab group. Waits for the page to
|
||||
load before returning.
|
||||
|
||||
Args:
|
||||
url: URL to navigate to
|
||||
@@ -192,7 +192,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_close", params, result=result)
|
||||
return result
|
||||
|
||||
@@ -271,7 +271,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
|
||||
ctx = _get_context(profile)
|
||||
if not ctx:
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
|
||||
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
|
||||
log_tool_call("browser_activate_tab", params, result=result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -107,22 +107,17 @@ class TestMultipleSubagentsTabGroups:
|
||||
|
||||
mock_bridge.create_context = AsyncMock(side_effect=mock_create_context)
|
||||
|
||||
# Register tools first
|
||||
register_lifecycle_tools(mcp)
|
||||
browser_start = mcp._tool_manager._tools["browser_start"].fn
|
||||
from gcu.browser.tools.lifecycle import _ensure_context
|
||||
|
||||
# Now patch for execution
|
||||
with patch("gcu.browser.tools.lifecycle.get_bridge", return_value=mock_bridge):
|
||||
# Simulate 3 different subagents starting browsers
|
||||
results = await asyncio.gather(
|
||||
browser_start(profile="agent_1"),
|
||||
browser_start(profile="agent_2"),
|
||||
browser_start(profile="agent_3"),
|
||||
)
|
||||
results = await asyncio.gather(
|
||||
_ensure_context(mock_bridge, "agent_1"),
|
||||
_ensure_context(mock_bridge, "agent_2"),
|
||||
_ensure_context(mock_bridge, "agent_3"),
|
||||
)
|
||||
|
||||
# Each should have created a separate context
|
||||
assert mock_bridge.create_context.call_count == 3
|
||||
assert all(r.get("ok") for r in results)
|
||||
assert all(created for (_, _, created) in results)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_concurrent_tab_operations_different_groups(self, mcp: FastMCP, mock_bridge: MagicMock):
|
||||
@@ -709,11 +704,11 @@ class TestErrorHandling:
|
||||
mock_bridge = MagicMock(spec=BeelineBridge)
|
||||
mock_bridge.is_connected = False
|
||||
|
||||
register_lifecycle_tools(mcp)
|
||||
browser_start = mcp._tool_manager._tools["browser_start"].fn
|
||||
register_tab_tools(mcp)
|
||||
browser_open = mcp._tool_manager._tools["browser_open"].fn
|
||||
|
||||
with patch("gcu.browser.tools.lifecycle.get_bridge", return_value=mock_bridge):
|
||||
result = await browser_start(profile="test")
|
||||
with patch("gcu.browser.tools.tabs.get_bridge", return_value=mock_bridge):
|
||||
result = await browser_open(url="https://example.com", profile="test")
|
||||
|
||||
assert result.get("ok") is False
|
||||
assert "not connected" in result.get("error", "").lower()
|
||||
|
||||
@@ -374,6 +374,190 @@ class TestWebScrapeToolLinkConversion:
|
||||
assert len([t for t in texts if not t.strip()]) == 0
|
||||
|
||||
|
||||
class TestWebScrapeToolAIFriendlyOutput:
|
||||
"""Tests for the AI-friendly output additions: structured data,
|
||||
headings, page_type, block-level newlines, inline links, truncation
|
||||
metadata, and offset-based pagination."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch(_STEALTH_PATH)
|
||||
@patch(_PW_PATH)
|
||||
async def test_block_level_newlines_preserved(self, mock_pw, mock_stealth, web_scrape_fn):
|
||||
"""Block elements (p, h1, li) produce newlines, not space-collapsed."""
|
||||
html = """
|
||||
<html><body>
|
||||
<h1>Title</h1>
|
||||
<p>First paragraph.</p>
|
||||
<p>Second paragraph.</p>
|
||||
<ul><li>Item one</li><li>Item two</li></ul>
|
||||
</body></html>
|
||||
"""
|
||||
mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
|
||||
mock_pw.return_value = mock_cm
|
||||
mock_stealth.return_value.apply_stealth_async = AsyncMock()
|
||||
|
||||
result = await web_scrape_fn(url="https://example.com")
|
||||
assert "error" not in result
|
||||
content = result["content"]
|
||||
assert "Title" in content
|
||||
assert "First paragraph." in content
|
||||
assert "Second paragraph." in content
|
||||
# Block separation should produce newlines, not run paragraphs together
|
||||
assert "First paragraph.\n" in content or "First paragraph.\n\nSecond" in content
|
||||
assert "Item one" in content and "Item two" in content
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch(_STEALTH_PATH)
|
||||
@patch(_PW_PATH)
|
||||
async def test_headings_outline_returned(self, mock_pw, mock_stealth, web_scrape_fn):
|
||||
"""Headings outline lists h1-h6 with level + text."""
|
||||
html = """
|
||||
<html><body>
|
||||
<h1>Top</h1>
|
||||
<h2>Section A</h2>
|
||||
<h3>Sub A1</h3>
|
||||
</body></html>
|
||||
"""
|
||||
mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
|
||||
mock_pw.return_value = mock_cm
|
||||
mock_stealth.return_value.apply_stealth_async = AsyncMock()
|
||||
|
||||
result = await web_scrape_fn(url="https://example.com")
|
||||
assert result["headings"] == [
|
||||
{"level": 1, "text": "Top"},
|
||||
{"level": 2, "text": "Section A"},
|
||||
{"level": 3, "text": "Sub A1"},
|
||||
]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch(_STEALTH_PATH)
|
||||
@patch(_PW_PATH)
|
||||
async def test_inline_links_when_include_links(self, mock_pw, mock_stealth, web_scrape_fn):
|
||||
"""include_links=True inlines anchors as [text](url) in content."""
|
||||
html = """
|
||||
<html><body>
|
||||
<p>See <a href="/docs">our docs</a> for details.</p>
|
||||
</body></html>
|
||||
"""
|
||||
mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
|
||||
mock_pw.return_value = mock_cm
|
||||
mock_stealth.return_value.apply_stealth_async = AsyncMock()
|
||||
|
||||
result = await web_scrape_fn(url="https://example.com", include_links=True)
|
||||
assert "[our docs](https://example.com/docs)" in result["content"]
|
||||
# Separate links list still present for back-compat
|
||||
assert any(link["text"] == "our docs" for link in result["links"])
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch(_STEALTH_PATH)
|
||||
@patch(_PW_PATH)
|
||||
async def test_structured_data_json_ld(self, mock_pw, mock_stealth, web_scrape_fn):
|
||||
"""JSON-LD blocks are parsed and surfaced under structured_data."""
|
||||
html = """
|
||||
<html><head>
|
||||
<script type="application/ld+json">
|
||||
{"@type": "Article", "headline": "Hello"}
|
||||
</script>
|
||||
</head><body><p>body</p></body></html>
|
||||
"""
|
||||
mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
|
||||
mock_pw.return_value = mock_cm
|
||||
mock_stealth.return_value.apply_stealth_async = AsyncMock()
|
||||
|
||||
result = await web_scrape_fn(url="https://example.com")
|
||||
assert "structured_data" in result
|
||||
assert result["structured_data"]["json_ld"] == [
|
||||
{"@type": "Article", "headline": "Hello"}
|
||||
]
|
||||
|
||||
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_structured_data_open_graph(self, mock_pw, mock_stealth, web_scrape_fn):
    """og:* meta tags are collected into structured_data.open_graph."""
    page_html = """
    <html><head>
    <meta property="og:title" content="OG Title">
    <meta property="og:type" content="article">
    </head><body><p>body</p></body></html>
    """
    mock_stealth.return_value.apply_stealth_async = AsyncMock()
    playwright_cm, *_ = _make_playwright_mocks(page_html, final_url="https://example.com")
    mock_pw.return_value = playwright_cm

    resp = await web_scrape_fn(url="https://example.com")

    # The "og:" prefix is stripped, leaving bare property names as keys.
    assert resp["structured_data"]["open_graph"] == {
        "title": "OG Title",
        "type": "article",
    }
|
||||
|
||||
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_truncation_metadata(self, mock_pw, mock_stealth, web_scrape_fn):
    """When content exceeds max_length, the response carries pagination metadata."""
    filler = "a" * 5000
    page_html = f"<html><body>{filler}</body></html>"
    mock_stealth.return_value.apply_stealth_async = AsyncMock()
    playwright_cm, *_ = _make_playwright_mocks(page_html, final_url="https://example.com")
    mock_pw.return_value = playwright_cm

    resp = await web_scrape_fn(url="https://example.com", max_length=1000)

    # 5000 chars of body text windowed to 1000 => truncated, with the
    # follow-up offset pointing just past the returned window.
    assert resp["truncated"] is True
    assert resp["total_length"] == 5000
    assert resp["offset"] == 0
    assert resp["next_offset"] == 1000
|
||||
|
||||
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_offset_pagination(self, mock_pw, mock_stealth, web_scrape_fn):
    """Passing offset returns the content window starting at that character."""
    # Three 1000-char runs: a..a b..b c..c
    payload = "".join(ch * 1000 for ch in "abc")
    page_html = f"<html><body>{payload}</body></html>"
    mock_stealth.return_value.apply_stealth_async = AsyncMock()
    playwright_cm, *_ = _make_playwright_mocks(page_html, final_url="https://example.com")
    mock_pw.return_value = playwright_cm

    resp = await web_scrape_fn(url="https://example.com", max_length=1000, offset=1000)

    assert resp["offset"] == 1000
    # Window [1000, 2000) lands entirely inside the "b" run.
    assert resp["content"].startswith("b")
    assert resp["truncated"] is True
    assert resp["next_offset"] == 2000
|
||||
|
||||
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_page_type_listing(self, mock_pw, mock_stealth, web_scrape_fn):
    """Three or more <article> elements classify the page as a listing."""
    page_html = """
    <html><body>
    <article><h2>Post 1</h2></article>
    <article><h2>Post 2</h2></article>
    <article><h2>Post 3</h2></article>
    </body></html>
    """
    mock_stealth.return_value.apply_stealth_async = AsyncMock()
    playwright_cm, *_ = _make_playwright_mocks(page_html, final_url="https://example.com")
    mock_pw.return_value = playwright_cm

    resp = await web_scrape_fn(url="https://example.com")

    assert resp["page_type"] == "listing"
|
||||
|
||||
@pytest.mark.asyncio
@patch(_STEALTH_PATH)
@patch(_PW_PATH)
async def test_page_type_article(self, mock_pw, mock_stealth, web_scrape_fn):
    """A lone <article> element classifies the page as an article."""
    page_html = "<html><body><article><p>Hello</p></article></body></html>"
    mock_stealth.return_value.apply_stealth_async = AsyncMock()
    playwright_cm, *_ = _make_playwright_mocks(page_html, final_url="https://example.com")
    mock_pw.return_value = playwright_cm

    resp = await web_scrape_fn(url="https://example.com")

    assert resp["page_type"] == "article"
|
||||
|
||||
|
||||
class TestWebScrapeToolErrorHandling:
|
||||
"""Tests for error handling and early exit before JS wait."""
|
||||
|
||||
@@ -388,7 +572,9 @@ class TestWebScrapeToolErrorHandling:
|
||||
mock_stealth.return_value.apply_stealth_async = AsyncMock()
|
||||
|
||||
result = await web_scrape_fn(url="https://example.com/missing")
|
||||
assert result == {"error": "HTTP 404: Failed to fetch URL"}
|
||||
assert result["error"] == "HTTP 404: Failed to fetch URL"
|
||||
assert result["status"] == 404
|
||||
assert "hint" in result
|
||||
mock_page.wait_for_load_state.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
Reference in New Issue
Block a user