feat: improve web search and consolidate browser open

This commit is contained in:
Richard Tang
2026-05-01 14:55:20 -07:00
parent b939a875a7
commit a09eac06f1
21 changed files with 414 additions and 168 deletions
-1
View File
@@ -47,7 +47,6 @@
"Bash(grep -v ':0$')",
"Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')",
"mcp__gcu-tools__browser_status",
"mcp__gcu-tools__browser_start",
"mcp__gcu-tools__browser_navigate",
"mcp__gcu-tools__browser_evaluate",
"mcp__gcu-tools__browser_screenshot",
@@ -214,7 +214,7 @@ Curated list of known browser automation edge cases with symptoms, causes, and f
| **Symptom** | `browser_open()` returns `"No group with id: XXXXXXX"` even though `browser_status` shows `running: true` |
| **Root Cause** | In-memory `_contexts` dict has a stale `groupId` from a Chrome tab group that was closed outside the tool (e.g. user closed the tab group) |
| **Detection** | `browser_status` returns `running: true` but `browser_open` fails with "No group with id" |
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_start()` again |
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_open(url)` to lazy-create a fresh one |
| **Code** | `tools/lifecycle.py:144-160` - `already_running` check uses cached dict without validating against Chrome |
| **Verified** | 2026-04-03 ✓ |
@@ -249,7 +249,7 @@ or find files. Mtime-sorted in files mode.
## Browser Automation (gcu-tools MCP)
- Use `browser_*` tools — `browser_open(url)` is the cold-start entry point \
(lazy-creates the context; no `browser_start` first). Then `browser_navigate`, \
(lazy-creates the context; no separate "start" call). Then `browser_navigate`, \
`browser_click`, `browser_type`, `browser_snapshot`, \
<!-- vision-only -->`browser_screenshot`, <!-- /vision-only -->`browser_scroll`, \
`browser_tabs`, `browser_close`, `browser_evaluate`, etc.
@@ -88,7 +88,6 @@ _TOOL_CATEGORIES: dict[str, list[str]] = {
"browser_basic": [
"browser_setup",
"browser_status",
"browser_start",
"browser_stop",
"browser_tabs",
"browser_open",
@@ -17,7 +17,7 @@ Use browser nodes (with `tools: {policy: "all"}`) when:
## Available Browser Tools
All tools are prefixed with `browser_`:
- `browser_open`, `browser_navigate` preferred entry points; both lazy-create a browser context, so a single `browser_open(url)` covers the cold path. Use `browser_start` only to warm a profile without a URL or to recreate a context after `browser_stop`.
- `browser_open`, `browser_navigate` — both lazy-create the browser context, so a single `browser_open(url)` covers the cold path. To recover from a stale context, call `browser_stop` then `browser_open(url)` again.
- `browser_click`, `browser_click_coordinate`, `browser_type`, `browser_type_focused` — interact
- `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
- `browser_snapshot` — compact accessibility-tree read (structured)
+1 -1
View File
@@ -158,7 +158,7 @@ cookie consent banners if they block content.
- If `browser_snapshot` fails, try `browser_get_text` with a narrow
selector as fallback.
- If `browser_open` fails or the page seems stale, `browser_stop`
`browser_start` retry.
  then `browser_open(url)` to lazy-create a fresh context.
## `browser_evaluate`
@@ -410,7 +410,7 @@ In all of these cases the script is SHORT (< 10 lines) and the result is CONSUME
- If a tool fails, retry once with the same approach.
- If it fails a second time, STOP retrying and switch approach.
- If `browser_snapshot` fails, try `browser_get_text` with a specific small selector as fallback.
- If `browser_open` fails or page seems stale, `browser_stop`, then `browser_start`, then retry.
- If `browser_open` fails or page seems stale, `browser_stop`, then `browser_open(url)` again to recreate a fresh context.
## Verified workflows
@@ -17,16 +17,15 @@ map_search_gcu = NodeSpec(
You are a browser agent. Your job: Search Google Maps for the provided query and extract business names and website URLs.
## Workflow
1. browser_start
2. browser_open(url="https://www.google.com/maps")
3. use the url query to search for the keyword
3.1 alternatively, use browser_type or browser_click to search for the "query" in memory.'
4. browser_wait(seconds=3)
5. browser_snapshot to find the list of results.
6. For each relevant result, extract:
1. browser_open(url="https://www.google.com/maps") # lazy-creates the context
2. use the url query to search for the keyword
2.1 alternatively, use browser_type or browser_click to search for the "query" in memory.
3. browser_wait(seconds=3)
4. browser_snapshot to find the list of results.
5. For each relevant result, extract:
- Name of the business
- Website URL (look for the website icon/link)
7. set_output("business_list", [{"name": "...", "website": "..."}, ...])
6. set_output("business_list", [{"name": "...", "website": "..."}, ...])
## Constraints
- Extract at least 5-10 businesses if possible.
@@ -24,13 +24,12 @@ Focus on:
- Hardware/Silicon breakthroughs
## Instructions
1. browser_start
2. For each handle:
a. browser_open(url=f"https://x.com/{handle}")
1. For each handle:
a. browser_open(url=f"https://x.com/{handle}") # lazy-creates the context on first call
b. browser_wait(seconds=5)
c. browser_snapshot
d. Parse relevant tech news text
3. set_output("raw_tweets", consolidated_json)
2. set_output("raw_tweets", consolidated_json)
""",
)
+6 -4
View File
@@ -244,12 +244,14 @@ def main() -> None:
logger.error("Failed to connect to GCU server: %s", e)
sys.exit(1)
# Auto-start browser context so tools work immediately
# Warm the browser context so the first interactive call doesn't pay the
# cold-start round trip. about:blank lazy-creates the context just like
# a real URL would, without committing to a destination page.
try:
result = client.call_tool("browser_start", {})
logger.info("browser_start: %s", result)
result = client.call_tool("browser_open", {"url": "about:blank"})
logger.info("browser_open(about:blank): %s", result)
except Exception as e:
logger.warning("browser_start failed (may already be started): %s", e)
logger.warning("browser warm-up failed (may already be running): %s", e)
app = create_app()
+1 -1
View File
@@ -457,7 +457,7 @@ let currentView = 'grid';
// Tool categories for sidebar grouping
const CATEGORIES = {
'Lifecycle': ['browser_setup', 'browser_start', 'browser_stop', 'browser_status'],
'Lifecycle': ['browser_setup', 'browser_stop', 'browser_status'],
'Tabs': ['browser_tabs', 'browser_open', 'browser_close', 'browser_close_all', 'browser_close_finished', 'browser_activate_tab'],
'Navigation': ['browser_navigate', 'browser_go_back', 'browser_go_forward', 'browser_reload'],
'Interactions': ['browser_click', 'browser_click_coordinate', 'browser_type', 'browser_type_focused', 'browser_press', 'browser_press_at', 'browser_hover', 'browser_hover_coordinate', 'browser_select', 'browser_scroll', 'browser_drag'],
+1 -1
View File
@@ -61,7 +61,7 @@ All replies carry `{ id, result }` or `{ id, error }`.
# 1. At GCU server startup, open ws://localhost:9229/beeline and wait for
# the extension to connect (sends { type: "hello" }).
#
# 2. On browser_start(profile):
# 2. On the first browser tool call for a profile (lazy-start via _ensure_context):
# - Send { id, type: "context.create", agentId: profile }
# - Receive { groupId, tabId }
# - Store groupId in the session object (no Chrome process, no CDP port)
@@ -10,12 +10,14 @@ Validates URLs against internal network ranges to prevent SSRF attacks.
from __future__ import annotations
import ipaddress
import json
import re
import socket
from typing import Any
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString
from fastmcp import FastMCP
from playwright.async_api import (
Error as PlaywrightError,
@@ -82,6 +84,7 @@ def register_tools(mcp: FastMCP) -> None:
selector: str | None = None,
include_links: bool = False,
max_length: int = 50000,
offset: int = 0,
respect_robots_txt: bool = True,
) -> dict:
"""
@@ -94,12 +97,18 @@ def register_tools(mcp: FastMCP) -> None:
Args:
url: URL of the webpage to scrape
selector: CSS selector to target specific content (e.g., 'article', '.main-content')
include_links: Include extracted links in the response
max_length: Maximum length of extracted text (1000-500000)
include_links: When True, links are inlined as `[text](url)` in
content and also returned as a `links` list
max_length: Maximum length of extracted text returned in this call (1000-500000)
offset: Character offset into the extracted text. Use with
`next_offset` from a prior truncated result to paginate.
respect_robots_txt: Whether to respect robots.txt rules (default True)
Returns:
Dict with scraped content (url, title, description, content, length) or error dict
Dict with: url, final_url, title, description, page_type
(article|listing|page), content, length, offset, total_length,
truncated, next_offset, headings, structured_data (json_ld + open_graph),
and optionally links. On error, returns {"error": str, ...} with a hint when applicable.
"""
try:
# Validate URL
@@ -128,6 +137,10 @@ def register_tools(mcp: FastMCP) -> None:
"error": f"Blocked by robots.txt: {url}",
"url": url,
"skipped": True,
"hint": (
"Pass respect_robots_txt=False if you have "
"authorization to scrape this site."
),
}
except Exception:
pass # If robots.txt can't be fetched, proceed anyway
@@ -195,7 +208,17 @@ def register_tools(mcp: FastMCP) -> None:
return {"error": "Navigation failed: no response received"}
if response.status != 200:
return {"error": f"HTTP {response.status}: Failed to fetch URL"}
hint = (
"Site likely requires auth, blocks bots, or is rate-limiting."
if response.status in (401, 403, 429)
else "Resource may not exist or server may be down."
)
return {
"error": f"HTTP {response.status}: Failed to fetch URL",
"url": url,
"status": response.status,
"hint": hint,
}
content_type = response.headers.get("content-type", "").lower()
if not any(t in content_type for t in ["text/html", "application/xhtml+xml"]):
@@ -218,63 +241,165 @@ def register_tools(mcp: FastMCP) -> None:
# Parse rendered HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
base_url = str(response.url) # Final URL after redirects
# Extract structured data BEFORE noise removal — JSON-LD lives
# in <script>, which gets decomposed below. JSON-LD is often the
# cleanest source of structured info on listing pages.
json_ld: list[Any] = []
for script in soup.find_all("script", type="application/ld+json"):
raw = script.string or script.get_text() or ""
if raw.strip():
try:
json_ld.append(json.loads(raw))
except (json.JSONDecodeError, TypeError):
pass
open_graph: dict[str, str] = {}
for meta in soup.find_all("meta"):
prop = (meta.get("property") or "").strip()
if prop.startswith("og:"):
val = (meta.get("content") or "").strip()
if val:
open_graph[prop[3:]] = val
# Remove noise elements
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]):
tag.decompose()
# Get title and description
# Get title and description (fall back to OG description)
title = soup.title.get_text(strip=True) if soup.title else ""
description = ""
meta_desc = soup.find("meta", attrs={"name": "description"})
if meta_desc:
description = meta_desc.get("content", "")
description = meta_desc.get("content", "") or ""
if not description:
description = open_graph.get("description", "")
# Target content
# Headings outline (capped) — lets the agent drill in via selector
headings: list[dict[str, Any]] = []
for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
h_text = h.get_text(strip=True)
if h_text:
headings.append({"level": int(h.name[1]), "text": h_text})
if len(headings) >= 100:
break
# Page-type heuristic: many <article> blocks → listing page
article_count = len(soup.find_all("article"))
if article_count >= 3:
page_type = "listing"
elif article_count == 1 or soup.find("main"):
page_type = "article"
else:
page_type = "page"
# Locate target subtree
if selector:
content_elem = soup.select_one(selector)
if not content_elem:
return {"error": f"No elements found matching selector: {selector}"}
text = content_elem.get_text(separator=" ", strip=True)
return {
"error": f"No elements found matching selector: {selector}",
"url": url,
"hint": "Try a broader selector or omit selector to use auto-detection.",
}
else:
# Auto-detect main content
main_content = (
soup.find("article")
or soup.find("main")
# Prefer <main> over the first <article> — on listing pages
# the latter would drop every article after the first.
content_elem = (
soup.find("main")
or soup.find(attrs={"role": "main"})
or soup.find("article")
or soup.find(class_=["content", "post", "entry", "article-body"])
or soup.find("body")
)
text = main_content.get_text(separator=" ", strip=True) if main_content else ""
# Clean up whitespace
text = " ".join(text.split())
# Collect link metadata BEFORE rewriting anchors (rewriting
# replaces <a> elements with NavigableStrings, so find_all('a')
# would miss them after).
links: list[dict[str, str]] = []
if content_elem and include_links:
for a in content_elem.find_all("a", href=True)[:50]:
link_text = a.get_text(strip=True)
href = urljoin(base_url, a["href"])
if link_text and href:
links.append({"text": link_text, "href": href})
# Truncate if needed (reserve 3 chars for the ellipsis so the
# final string stays within max_length)
if len(text) > max_length:
text = text[: max_length - 3] + "..."
text = ""
if content_elem:
# Inline anchors as [text](url) so links survive text
# extraction (otherwise the agent has to correlate `links`
# against the text blob).
if include_links:
for a in content_elem.find_all("a", href=True):
link_text = a.get_text(strip=True)
if link_text:
href = urljoin(base_url, a["href"])
a.replace_with(NavigableString(f"[{link_text}]({href})"))
# Convert <br> and block elements into newlines so the output
# preserves paragraph/list/heading structure rather than
# collapsing into one giant whitespace-joined string.
for br in content_elem.find_all("br"):
br.replace_with(NavigableString("\n"))
block_tags = (
"p", "h1", "h2", "h3", "h4", "h5", "h6",
"li", "tr", "div", "section", "article", "blockquote",
)
for block in content_elem.find_all(block_tags):
block.insert_before(NavigableString("\n"))
block.append(NavigableString("\n"))
raw_text = content_elem.get_text(separator=" ")
# Normalize: squash spaces within each line, collapse runs of
# blank lines to a single blank, trim.
cleaned: list[str] = []
blank = True # swallow leading blanks
for line in raw_text.split("\n"):
line = re.sub(r"[ \t]+", " ", line).strip()
if line:
cleaned.append(line)
blank = False
elif not blank:
cleaned.append("")
blank = True
text = "\n".join(cleaned).strip()
# Apply offset/truncation with continuation metadata. Reserve 3
# chars for the ellipsis so the returned string stays within
# max_length (back-compat with existing test expectations).
total_length = len(text)
offset = max(0, min(offset, total_length))
end = offset + max_length
truncated = end < total_length
sliced = text[offset:end]
if truncated and len(sliced) >= 3:
sliced = sliced[: -3] + "..."
structured_data: dict[str, Any] = {}
if json_ld:
structured_data["json_ld"] = json_ld
if open_graph:
structured_data["open_graph"] = open_graph
result: dict[str, Any] = {
"url": url,
"final_url": base_url,
"title": title,
"description": description,
"content": text,
"length": len(text),
"page_type": page_type,
"content": sliced,
"length": len(sliced),
"offset": offset,
"total_length": total_length,
"truncated": truncated,
"next_offset": end if truncated else None,
"headings": headings,
}
# Extract links if requested
if structured_data:
result["structured_data"] = structured_data
if include_links:
links: list[dict[str, str]] = []
base_url = str(response.url) # Use final URL after redirects
for a in soup.find_all("a", href=True)[:50]:
href = a["href"]
# Convert relative URLs to absolute URLs
absolute_href = urljoin(base_url, href)
link_text = a.get_text(strip=True)
if link_text and absolute_href:
links.append({"text": link_text, "href": absolute_href})
result["links"] = links
return result
+1 -1
View File
@@ -41,7 +41,7 @@ def register_tools(mcp: FastMCP) -> None:
"""Register all GCU browser tools with the MCP server.
Tools are organized into categories:
- Lifecycle: browser_start, browser_stop, browser_status
- Lifecycle: browser_setup, browser_status, browser_stop (browser_open lazy-creates the context)
- Tabs: browser_tabs, browser_open, browser_close, browser_activate_tab
- Navigation: browser_navigate, browser_go_back, browser_go_forward, browser_reload
- Inspection: browser_screenshot, browser_snapshot, browser_console
+2 -2
View File
@@ -642,7 +642,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_snapshot", params, result=result)
return result
@@ -727,7 +727,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_html", params, result=result)
return result
+11 -11
View File
@@ -153,7 +153,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_click", params, result=result)
return result
@@ -247,7 +247,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_click_coordinate", params, result=result)
return _text_only(result)
@@ -352,7 +352,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_type", params, result=result)
return result
@@ -432,7 +432,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_type_focused", params, result=result)
return result
@@ -506,7 +506,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_press", params, result=result)
return result
@@ -560,7 +560,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_hover", params, result=result)
return result
@@ -627,7 +627,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_hover_coordinate", params, result=result)
return _text_only(result)
@@ -712,7 +712,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_press_at", params, result=result)
return _text_only(result)
@@ -782,7 +782,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_select", params, result=result)
return result
@@ -860,7 +860,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_scroll", params, result=result)
return result
@@ -924,7 +924,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_drag", params, result=result)
return result
+3 -60
View File
@@ -61,7 +61,7 @@ async def _ensure_context(
Lazy-creates the browser context (tab group + seed tab) the first time
a profile is used so URL-taking tools (``browser_open`` /
``browser_navigate``) can be the agent's single cold-start entry
point instead of forcing an explicit ``browser_start`` round trip.
point — no separate "start" tool to remember.
Caller must verify ``bridge`` is connected first; any failure in
``bridge.create_context`` propagates so the caller's existing
@@ -137,7 +137,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
return {
"ok": True,
"connected": True,
"status": "Extension is connected and ready. Call browser_start to begin.",
"status": "Extension is connected and ready. Call browser_open(url) to begin.",
}
return {
@@ -150,7 +150,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
"step_3": "Click 'Load unpacked'",
"step_4": f"Select this directory: {ext_path}",
"step_5": ("Click the extension icon in the Chrome toolbar to confirm it says 'Connected'"),
"step_6": "Return here and call browser_start",
"step_6": "Return here and call browser_open(url) to begin",
},
"extensionPath": ext_path,
"extensionPathExists": ext_exists,
@@ -238,63 +238,6 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
)
return result
@mcp.tool()
async def browser_start(profile: str | None = None) -> dict:
"""
Explicitly create a browser context (tab group) for ``profile``.
Most workflows do NOT need to call this directly: ``browser_open``
and ``browser_navigate`` lazy-create a context on first use, so a
single ``browser_open(url)`` covers the cold path. Reach for
``browser_start`` when you want to (a) warm a profile without
opening a URL yet, or (b) recreate a context after
``browser_stop`` to clear stale state.
No separate browser process is launched uses the user's
existing Chrome via the Beeline extension.
Args:
profile: Browser profile name (default: "default")
Returns:
Dict with start status (``"started"`` on fresh creation,
``"already_running"`` when a context for the profile exists),
including ``groupId`` and ``activeTabId``.
"""
start = time.perf_counter()
params = {"profile": profile}
bridge = get_bridge()
if not bridge or not bridge.is_connected:
result = {
"ok": False,
"error": ("Browser extension not connected. Call browser_setup for installation instructions."),
}
log_tool_call("browser_start", params, result=result)
return result
try:
profile_name, ctx, created = await _ensure_context(bridge, profile)
result = {
"ok": True,
"status": "started" if created else "already_running",
"profile": profile_name,
"groupId": ctx.get("groupId"),
"activeTabId": ctx.get("activeTabId"),
}
log_tool_call(
"browser_start",
params,
result=result,
duration_ms=(time.perf_counter() - start) * 1000,
)
return result
except Exception as e:
logger.exception("Failed to start browser context")
result = {"ok": False, "error": str(e)}
log_tool_call("browser_start", params, error=e, duration_ms=(time.perf_counter() - start) * 1000)
return result
@mcp.tool()
async def browser_stop(profile: str | None = None) -> dict:
"""
+7 -8
View File
@@ -33,11 +33,10 @@ def register_navigation_tools(mcp: FastMCP) -> None:
"""
Navigate a tab to a URL.
Lazy-creates a browser context if none exists (no need to call
``browser_start`` first); when no ``tab_id`` is given and the
context was just created, navigation lands on the seed tab.
Prefer ``browser_open`` when you specifically want a new tab
``browser_navigate`` is for redirecting an existing tab.
Lazy-creates a browser context if none exists; when no ``tab_id``
is given and the context was just created, navigation lands on
the seed tab. Prefer ``browser_open`` when you specifically want
a new tab — ``browser_navigate`` is for redirecting an existing tab.
Waits for the page to reach the ``wait_until`` condition before
returning.
@@ -130,7 +129,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_go_back", params, result=result)
return result
@@ -180,7 +179,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_go_forward", params, result=result)
return result
@@ -235,7 +234,7 @@ def register_navigation_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_reload", params, result=result)
return result
+9 -9
View File
@@ -65,7 +65,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_tabs", params, result=result)
return result
@@ -100,12 +100,12 @@ def register_tab_tools(mcp: FastMCP) -> None:
"""
Open a browser tab at the given URL — preferred entry point.
This is the agent's primary "go to a page" tool. If no browser
context exists yet for the profile, one is created transparently
(no need to call ``browser_start`` first). The first call after
a fresh context reuses the seed ``about:blank`` tab; subsequent
calls open new tabs in the agent's tab group. Waits for the
page to load before returning.
This is the agent's primary "go to a page" tool and the cold-start
entry point — if no browser context exists yet for the profile,
one is created transparently. The first call after a fresh
context reuses the seed ``about:blank`` tab; subsequent calls
open new tabs in the agent's tab group. Waits for the page to
load before returning.
Args:
url: URL to navigate to
@@ -192,7 +192,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_close", params, result=result)
return result
@@ -271,7 +271,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
ctx = _get_context(profile)
if not ctx:
result = {"ok": False, "error": "Browser not started. Call browser_start first."}
result = {"ok": False, "error": "Browser not started. Call browser_open(url) first to open a tab."}
log_tool_call("browser_activate_tab", params, result=result)
return result
@@ -107,22 +107,17 @@ class TestMultipleSubagentsTabGroups:
mock_bridge.create_context = AsyncMock(side_effect=mock_create_context)
# Register tools first
register_lifecycle_tools(mcp)
browser_start = mcp._tool_manager._tools["browser_start"].fn
from gcu.browser.tools.lifecycle import _ensure_context
# Now patch for execution
with patch("gcu.browser.tools.lifecycle.get_bridge", return_value=mock_bridge):
# Simulate 3 different subagents starting browsers
results = await asyncio.gather(
browser_start(profile="agent_1"),
browser_start(profile="agent_2"),
browser_start(profile="agent_3"),
_ensure_context(mock_bridge, "agent_1"),
_ensure_context(mock_bridge, "agent_2"),
_ensure_context(mock_bridge, "agent_3"),
)
# Each should have created a separate context
assert mock_bridge.create_context.call_count == 3
assert all(r.get("ok") for r in results)
assert all(created for (_, _, created) in results)
@pytest.mark.asyncio
async def test_concurrent_tab_operations_different_groups(self, mcp: FastMCP, mock_bridge: MagicMock):
@@ -709,11 +704,11 @@ class TestErrorHandling:
mock_bridge = MagicMock(spec=BeelineBridge)
mock_bridge.is_connected = False
register_lifecycle_tools(mcp)
browser_start = mcp._tool_manager._tools["browser_start"].fn
register_tab_tools(mcp)
browser_open = mcp._tool_manager._tools["browser_open"].fn
with patch("gcu.browser.tools.lifecycle.get_bridge", return_value=mock_bridge):
result = await browser_start(profile="test")
with patch("gcu.browser.tools.tabs.get_bridge", return_value=mock_bridge):
result = await browser_open(url="https://example.com", profile="test")
assert result.get("ok") is False
assert "not connected" in result.get("error", "").lower()
+187 -1
View File
@@ -374,6 +374,190 @@ class TestWebScrapeToolLinkConversion:
assert len([t for t in texts if not t.strip()]) == 0
class TestWebScrapeToolAIFriendlyOutput:
    """Tests for the AI-friendly output additions: structured data,
    headings, page_type, block-level newlines, inline links, truncation
    metadata, and offset-based pagination."""

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_block_level_newlines_preserved(self, mock_pw, mock_stealth, web_scrape_fn):
        """Block elements (p, h1, li) produce newlines, not space-collapsed."""
        html = """
<html><body>
<h1>Title</h1>
<p>First paragraph.</p>
<p>Second paragraph.</p>
<ul><li>Item one</li><li>Item two</li></ul>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com")

        assert "error" not in result
        content = result["content"]
        assert "Title" in content
        assert "First paragraph." in content
        assert "Second paragraph." in content
        # Block separation should produce a newline immediately after the
        # paragraph text.  (A previous version also or'd in the condition
        # `"First paragraph.\n\nSecond" in content`, but that disjunct implies
        # this one, so the `or` was dead logic and has been removed.)
        assert "First paragraph.\n" in content
        assert "Item one" in content and "Item two" in content

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_headings_outline_returned(self, mock_pw, mock_stealth, web_scrape_fn):
        """Headings outline lists h1-h6 with level + text."""
        html = """
<html><body>
<h1>Top</h1>
<h2>Section A</h2>
<h3>Sub A1</h3>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com")

        # Order must follow document order, with the numeric heading level.
        assert result["headings"] == [
            {"level": 1, "text": "Top"},
            {"level": 2, "text": "Section A"},
            {"level": 3, "text": "Sub A1"},
        ]

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_inline_links_when_include_links(self, mock_pw, mock_stealth, web_scrape_fn):
        """include_links=True inlines anchors as [text](url) in content."""
        html = """
<html><body>
<p>See <a href="/docs">our docs</a> for details.</p>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com", include_links=True)

        # Relative href must be resolved against the final URL.
        assert "[our docs](https://example.com/docs)" in result["content"]
        # Separate links list still present for back-compat
        assert any(link["text"] == "our docs" for link in result["links"])

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_structured_data_json_ld(self, mock_pw, mock_stealth, web_scrape_fn):
        """JSON-LD blocks are parsed and surfaced under structured_data."""
        html = """
<html><head>
<script type="application/ld+json">
{"@type": "Article", "headline": "Hello"}
</script>
</head><body><p>body</p></body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com")

        assert "structured_data" in result
        assert result["structured_data"]["json_ld"] == [
            {"@type": "Article", "headline": "Hello"}
        ]

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_structured_data_open_graph(self, mock_pw, mock_stealth, web_scrape_fn):
        """OpenGraph meta tags are surfaced under structured_data.open_graph."""
        html = """
<html><head>
<meta property="og:title" content="OG Title">
<meta property="og:type" content="article">
</head><body><p>body</p></body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com")

        # Keys are the og:* suffixes, not the full property names.
        assert result["structured_data"]["open_graph"] == {
            "title": "OG Title",
            "type": "article",
        }

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_truncation_metadata(self, mock_pw, mock_stealth, web_scrape_fn):
        """Truncated responses set truncated/total_length/next_offset."""
        html = f"<html><body>{'a' * 5000}</body></html>"
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com", max_length=1000)

        assert result["truncated"] is True
        assert result["total_length"] == 5000
        # next_offset points at the first character NOT returned in this page.
        assert result["next_offset"] == 1000
        assert result["offset"] == 0

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_offset_pagination(self, mock_pw, mock_stealth, web_scrape_fn):
        """offset arg returns content starting from the given character."""
        body = "a" * 1000 + "b" * 1000 + "c" * 1000
        html = f"<html><body>{body}</body></html>"
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com", max_length=1000, offset=1000)

        assert result["offset"] == 1000
        # Window should start in the b-region
        assert result["content"].startswith("b")
        assert result["truncated"] is True
        assert result["next_offset"] == 2000

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_page_type_listing(self, mock_pw, mock_stealth, web_scrape_fn):
        """3+ <article> elements => page_type 'listing'."""
        html = """
<html><body>
<article><h2>Post 1</h2></article>
<article><h2>Post 2</h2></article>
<article><h2>Post 3</h2></article>
</body></html>
"""
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com")

        assert result["page_type"] == "listing"

    @pytest.mark.asyncio
    @patch(_STEALTH_PATH)
    @patch(_PW_PATH)
    async def test_page_type_article(self, mock_pw, mock_stealth, web_scrape_fn):
        """Single <article> => page_type 'article'."""
        html = "<html><body><article><p>Hello</p></article></body></html>"
        mock_cm, _, _ = _make_playwright_mocks(html, final_url="https://example.com")
        mock_pw.return_value = mock_cm
        mock_stealth.return_value.apply_stealth_async = AsyncMock()

        result = await web_scrape_fn(url="https://example.com")

        assert result["page_type"] == "article"
class TestWebScrapeToolErrorHandling:
"""Tests for error handling and early exit before JS wait."""
@@ -388,7 +572,9 @@ class TestWebScrapeToolErrorHandling:
mock_stealth.return_value.apply_stealth_async = AsyncMock()
result = await web_scrape_fn(url="https://example.com/missing")
assert result == {"error": "HTTP 404: Failed to fetch URL"}
assert result["error"] == "HTTP 404: Failed to fetch URL"
assert result["status"] == 404
assert "hint" in result
mock_page.wait_for_load_state.assert_not_called()
@pytest.mark.asyncio