From aba0ff07ba42e5d0dbe259b90d02852bdd95afd0 Mon Sep 17 00:00:00 2001 From: Timothy Date: Thu, 16 Apr 2026 20:29:05 -0700 Subject: [PATCH 1/2] fix: model invariant screenshot --- .claude/settings.json | 13 +- .mcp.json | 9 +- .../agents/queen/reference/gcu_guide.md | 5 +- core/framework/orchestrator/gcu.py | 39 +- .../browser-automation/SKILL.md | 54 +-- .../linkedin-automation/SKILL.md | 2 +- tools/src/gcu/browser/bridge.py | 104 ++++-- tools/src/gcu/browser/tools/advanced.py | 10 + tools/src/gcu/browser/tools/inspection.py | 335 ++++++++---------- tools/src/gcu/browser/tools/interactions.py | 67 ++-- tools/src/gcu/server.py | 5 +- 11 files changed, 320 insertions(+), 323 deletions(-) diff --git a/.claude/settings.json b/.claude/settings.json index fbdc243f..1b61758d 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -44,7 +44,18 @@ "WebFetch(domain:docs.litellm.ai)", "Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)", "Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)", - "Bash(grep -v ':0$')" + "Bash(grep -v ':0$')", + "Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')", + "mcp__gcu-tools__browser_status", + "mcp__gcu-tools__browser_start", + "mcp__gcu-tools__browser_navigate", + "mcp__gcu-tools__browser_evaluate", + "mcp__gcu-tools__browser_screenshot", + "mcp__gcu-tools__browser_open", + "mcp__gcu-tools__browser_click_coordinate", + "mcp__gcu-tools__browser_get_rect", + "mcp__gcu-tools__browser_type_focused", + "mcp__gcu-tools__browser_wait" ], "additionalDirectories": [ "/home/timothy/.hive/skills/writing-hive-skills", diff --git a/.mcp.json b/.mcp.json index da39e4ff..b37e500c 100644 --- a/.mcp.json +++ b/.mcp.json @@ -1,3 +1,10 @@ { - "mcpServers": {} + "mcpServers": { + "gcu-tools": { + "type": "stdio", + "command": 
"uv", + "args": ["run", "python", "-m", "gcu.server", "--stdio"], + "cwd": "/home/timothy/aden/hive/tools" + } + } } diff --git a/core/framework/agents/queen/reference/gcu_guide.md b/core/framework/agents/queen/reference/gcu_guide.md index e0b8bd1d..33b7894c 100644 --- a/core/framework/agents/queen/reference/gcu_guide.md +++ b/core/framework/agents/queen/reference/gcu_guide.md @@ -25,7 +25,6 @@ All tools are prefixed with `browser_`: - `browser_screenshot` — visual capture (annotated PNG) - `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`) -- `browser_coords` — convert image pixels to CSS pixels (always use `css_x/y`, never `physical_x/y`) - `browser_scroll`, `browser_wait` — navigation helpers - `browser_evaluate` — run JavaScript - `browser_close`, `browser_close_finished` — tab cleanup @@ -38,9 +37,9 @@ All tools are prefixed with `browser_`: Neither tool is "preferred" universally — they're for different jobs. Default to snapshot on text-heavy static pages, screenshot on SPAs and anything shadow-DOM-heavy. Activate the `browser-automation` skill for the full decision tree. -## Coordinate rule: always CSS pixels +## Coordinate rule -Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS pixels**, not physical pixels. After a screenshot, use `browser_coords(image_x, image_y)` and feed the returned `css_x/y` (NOT `physical_x/y`) to `browser_click_coordinate`, `browser_hover_coordinate`, `browser_press_at`. Feeding physical pixels on a HiDPI display (DPR=1.6, 2, or 3) overshoots by `DPR×` and clicks land in the wrong place. `getBoundingClientRect()` already returns CSS pixels — pass through unchanged, no DPR multiplication. +Every browser tool that takes or returns coordinates operates in **screenshot pixels** (the 800 px wide JPEG `browser_screenshot` delivers). Read a pixel off the image, pass it straight to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. 
`browser_get_rect` and `browser_shadow_query` return `rect.cx` / `rect.cy` in the same space. The tools translate to CSS px internally — no scale awareness required. Avoid raw `getBoundingClientRect()` via `browser_evaluate` for coord lookup; use `browser_get_rect` instead. ## System prompt tips for browser nodes diff --git a/core/framework/orchestrator/gcu.py b/core/framework/orchestrator/gcu.py index 43ce1fff..6951ce61 100644 --- a/core/framework/orchestrator/gcu.py +++ b/core/framework/orchestrator/gcu.py @@ -42,25 +42,21 @@ after an interaction unless you need a fresh view. Only fall back to `browser_get_text` for extracting small elements by CSS selector. -## Coordinates: always CSS pixels +## Coordinates -Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS -pixels**, not physical pixels. This is critical and often gets wrong: +Every browser tool that takes or returns coordinates operates in +**screenshot pixels** (the 800 px wide JPEG `browser_screenshot` +delivers). Read a pixel off the image, pass it to +`browser_click_coordinate` / `browser_hover_coordinate` / +`browser_press_at`. `browser_get_rect` and `browser_shadow_query` +return `rect.cx` / `rect.cy` in the same space. The tools handle the +image-px → CSS-px translation internally; you do not need to know +about CSS pixels, DPR, or any scale factor. -| Tool | Unit | -|---|---| -| `browser_click_coordinate(x, y)` | **CSS pixels** | -| `browser_hover_coordinate(x, y)` | **CSS pixels** | -| `browser_press_at(x, y, key)` | **CSS pixels** | -| `getBoundingClientRect()` | already CSS pixels — pass straight through | -| `browser_coords(img_x, img_y)` | returns `css_x/y` (use this) and `physical_x/y` (debug only) | - -**Always use `css_x/y`** from `browser_coords`. Feeding `physical_x/y` -on a HiDPI display overshoots by `DPR×` — clicks land DPR times too -far right and down. On a DPR=1.6 display that's 60% off. 
- -Never multiply `getBoundingClientRect()` by `devicePixelRatio` — it's -already in the right unit. +Avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord +lookup — that returns CSS px and will be mis-scaled when fed to click +tools. Prefer `browser_get_rect` / `browser_shadow_query`, which +convert for you. ## Rich-text editors (X, LinkedIn DMs, Gmail, Reddit, Slack, Discord) @@ -88,11 +84,10 @@ reach shadow elements transparently. **Shadow-heavy site workflow:** 1. `browser_screenshot()` → visual image -2. Identify target visually → image coordinate -3. `browser_coords(x, y)` → CSS px -4. `browser_click_coordinate(css_x, css_y)` → lands via native hit - test; inputs get focused regardless of shadow depth -5. Type via `browser_type_focused` (no selector needed — types into the +2. Identify target visually → pixel `(x, y)` read straight off the image +3. `browser_click_coordinate(x, y)` → lands via native hit test; + inputs get focused regardless of shadow depth +4. Type via `browser_type_focused` (no selector needed — types into the already-focused element), or `browser_type` if you have a selector For selector-style access when you know the shadow path: diff --git a/core/framework/skills/_default_skills/browser-automation/SKILL.md b/core/framework/skills/_default_skills/browser-automation/SKILL.md index 0a0e7d7d..b3896f5b 100644 --- a/core/framework/skills/_default_skills/browser-automation/SKILL.md +++ b/core/framework/skills/_default_skills/browser-automation/SKILL.md @@ -12,25 +12,20 @@ metadata: All GCU browser tools drive a real Chrome instance through the Beeline extension and Chrome DevTools Protocol (CDP). That means clicks, keystrokes, and screenshots are processed by the actual browser's native hit testing, focus, and layout engines — **not** a synthetic event layer. Understanding this unlocks strategies that make hard sites easy. 
-## Coordinates: always CSS pixels +## Coordinates -**Chrome DevTools Protocol `Input.dispatchMouseEvent` operates in CSS pixels, not physical pixels.** - -When you call `browser_coords(image_x, image_y)` after a screenshot, the returned dict has both `css_x/y` and `physical_x/y`. **Always use `css_x/y` for clicks, hovers, and key presses.** +Every browser tool that takes or returns coordinates operates in **screenshot pixels** (the 800 px wide JPEG `browser_screenshot` delivers). Take a screenshot, read a pixel off the image, pass that number to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. Rect-returning tools (`browser_get_rect`, `browser_shadow_query`, and the `rect` inside `focused_element`) also return screenshot pixels. You do not need to convert anything, track scale factors, or know about CSS pixels or device pixel ratio — the tools translate internally before dispatching to Chrome. ``` -browser_screenshot() → image (downscaled to 800/900 px wide) -browser_coords(img_x, img_y) → {css_x, css_y, physical_x, physical_y} -browser_click_coordinate(css_x, css_y) ← USE css_x/y -browser_hover_coordinate(css_x, css_y) ← USE css_x/y -browser_press_at(css_x, css_y, key) ← USE css_x/y +browser_screenshot() → image (800 px wide JPEG) +browser_click_coordinate(x, y) → x, y are screenshot px +browser_hover_coordinate(x, y) → x, y are screenshot px +browser_press_at(x, y, key) → x, y are screenshot px +browser_get_rect(selector) → rect → rect.cx / rect.cy are screenshot px +browser_shadow_query(...) → rect → same ``` -Feeding `physical_x/y` on a HiDPI display overshoots by DPR× — on a DPR=1.6 laptop, clicks land 60% too far right and down. The ratio between `physicalScale` and `cssScale` tells you the effective DPR. - -`getBoundingClientRect()` already returns CSS pixels — feed those values straight through to click/hover tools without any DPR multiplication. 
- -**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Use `browser_shadow_query` which handles the math, or fall back to visually picking coordinates from a screenshot. +**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Prefer `browser_shadow_query` (which handles the math) or visually pick coordinates from a screenshot. When in doubt, avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord lookup — that returns CSS px and will be mis-scaled when passed to click tools. ## Screenshot + coordinates is shadow-agnostic — prefer it on shadow-heavy sites @@ -38,7 +33,7 @@ On sites that use Shadow DOM heavily (Reddit's faceplate Web Components, LinkedI Why: -- **CDP hit testing walks shadow roots natively.** `browser_click_coordinate(css_x, css_y)` routes through Chrome's native hit tester, which traverses open shadow roots automatically. You don't need to know the shadow structure. +- **CDP hit testing walks shadow roots natively.** `browser_click_coordinate(x, y)` routes through Chrome's native hit tester, which traverses open shadow roots automatically. You don't need to know the shadow structure. - **Keyboard dispatch follows focus** into shadow roots. After a click focuses an input (even one three shadow levels deep), `browser_press(...)` with no selector dispatches keys to `document.activeElement`'s computed focus target. - **Screenshots render the real layout** regardless of DOM implementation. @@ -46,12 +41,11 @@ Whereas `wait_for_selector`, `browser_click(selector=...)`, `browser_type(select ### Recommended workflow on shadow-heavy sites -1. 
`browser_screenshot()` → visual image -2. Identify the target visually → image pixel `(x, y)` (eyeball from the screenshot) -3. `browser_coords(x, y)` → convert to CSS px -4. `browser_click_coordinate(css_x, css_y)` → lands on the element via native hit testing; inputs get focused. **The response now includes `focused_element: {tag, id, role, contenteditable, rect, ...}`** — use it to verify you actually focused what you intended. -5. `browser_type_focused(text="...")` → dispatches CDP `Input.insertText` to `document.activeElement`. Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Use `browser_type(selector, text)` instead when you have a reliable CSS selector for a light-DOM element. -6. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`). +1. `browser_screenshot()` → 800 px wide JPEG. +2. Identify the target visually → pixel `(x, y)` read straight off the image. +3. `browser_click_coordinate(x, y)` → lands on the element via native hit testing; inputs get focused. **The response includes `focused_element: {tag, id, role, contenteditable, rect, inFrame?, ...}`** — use it to verify you actually focused what you intended. `rect` is in screenshot pixels (same space as the image). When focus is inside a same-origin iframe, the descriptor reports the inner element and adds `inFrame: [...]` breadcrumbs. +4. `browser_type_focused(text="...")` → inserts text into `document.activeElement` (traverses into same-origin iframes automatically). Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Use `browser_type(selector, text)` instead when you have a reliable CSS selector for a light-DOM element. +5. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`). 
### The click→type loop (canonical pattern) @@ -80,7 +74,7 @@ browser_shadow_query("reddit-search-large >>> #search-input") browser_get_rect("#interop-outlet >>> #ember37 >>> p") ``` -Returns the element's rect in **CSS pixels** (feed directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do. +Returns the element's rect in **screenshot pixels** (feed `rect.cx` / `rect.cy` directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do. ## Navigation and waiting @@ -220,25 +214,15 @@ Recognized without modifiers: `Enter`, `Tab`, `Escape`, `Backspace`, `Delete`, ` ## Screenshots ``` -browser_screenshot() # viewport, 900 px wide by default +browser_screenshot() # viewport, 800 px wide JPEG browser_screenshot(full_page=True) # full scrollable page browser_screenshot(selector="#header") # clip to element's rect ``` -Returns a PNG with automatic downscaling to a target width (default 900 px) plus a JSON metadata block containing `cssWidth`, `devicePixelRatio`, `physicalScale`, `cssScale`, and a `scaleHint` string. The image is also annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab. +Returns a JPEG (quality 75, ~50–120 KB) fixed at **800 px wide** — well below the vision-API resize threshold, so the model sees the exact pixels we emit. Metadata includes `imageWidth` (800), `cssWidth` (the page's real viewport width), `cssScale` (for debug only), and `physicalScale`. The image is annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab. The highlight overlay stays visible on the page for **10 seconds** after each interaction, then fades. Before a screenshot is likely, make sure your click / hover / type happens <10 s before the screenshot. 
-### Anatomy of the scale fields - -- `cssWidth` = `window.innerWidth` (CSS px) -- `devicePixelRatio` = `window.devicePixelRatio` (often 1.6, 2, or 3 on modern displays) -- `physicalScale = png_width / image_width` (how many physical-px per image-px) -- `cssScale = cssWidth / image_width` (how many CSS-px per image-px) -- Effective DPR = `physicalScale / cssScale` (should match `devicePixelRatio`) - -When converting image coordinates for clicks, always use `cssScale`. The `physicalScale` field is there for debugging HiDPI displays, not for inputs. - ## Scrolling - Use large scroll amounts (~2000) when loading more content — sites like Twitter and LinkedIn have lazy loading for paging. @@ -363,7 +347,7 @@ Then pass the most specific selector that uniquely identifies the right input (e - **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above. - **Using per-character `keyDown` on Lexical / Draft.js editors → keys dispatch but text never appears.** Those editors intercept `beforeinput` and route insertion through their own state machine; raw keyDown events are silently dropped. `browser_type` now uses `Input.insertText` by default (the CDP IME-commit method) which these editors accept cleanly. Only set `use_insert_text=False` when you explicitly need per-keystroke dispatch. - **Leaving a composer with text then trying to navigate → `beforeunload` dialog hangs the bridge.** LinkedIn and several other sites pop a native "unsent message" confirm. 
`browser_navigate` and `close_tab` both time out against this. Always strip `window.onbeforeunload = null` via `browser_evaluate` before any navigation after typing in a composer, or wrap your logic in a `try/finally` that runs the cleanup block. -- **Clicking at physical pixels.** CDP uses CSS px. `browser_coords` returns both for debugging, but always feed `css_x/y` to click tools. +- **Click landed in the wrong region (sidebar / header instead of target).** Check `focused_element` in the click response — it's ground truth for what actually got focused, including the `inFrame` breadcrumb when focus ends up inside a same-origin iframe. If it isn't the target (e.g. `className: "msg-conversation-listitem__link"` when you meant to hit a composer), adjust the pixel and retry. Coordinates you pass are screenshot pixels; the tool translates to CSS px internally, so a wrong result means you picked the wrong pixel off the image — not that any scale went sideways. - **Calling `wait_for_selector` on a shadow element.** It'll always time out. Use `browser_shadow_query` or the screenshot + coordinate strategy. - **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`. - **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements. 
diff --git a/core/framework/skills/_default_skills/linkedin-automation/SKILL.md b/core/framework/skills/_default_skills/linkedin-automation/SKILL.md index 844a904b..9ced68a4 100644 --- a/core/framework/skills/_default_skills/linkedin-automation/SKILL.md +++ b/core/framework/skills/_default_skills/linkedin-automation/SKILL.md @@ -34,7 +34,7 @@ LinkedIn is the hardest mainstream site to automate because it combines **shadow | Pending connection card | `.invitation-card, .invitations-card, [data-test-incoming-invitation-card]` | Filter out "invited you to follow" / "subscribe" cards | | Accept button | `button[aria-label*="Accept"]` within the card scope | Per-card scoping is critical — there are many Accept buttons on the page | -LinkedIn changes class names aggressively. If a class-based selector breaks, fall back to **`browser_screenshot` → visual identification → `browser_coords` → `browser_click_coordinate`**. The screenshot + coord path works regardless of class-name churn and regardless of shadow DOM. +LinkedIn changes class names aggressively. If a class-based selector breaks, fall back to **`browser_screenshot` → visual identification → `browser_click_coordinate`** with the pixel you read straight off the image (coordinates are screenshot pixels; the tool converts to CSS px internally). The screenshot + coord path works regardless of class-name churn and regardless of shadow DOM. ## Profile Message flow (verified end-to-end 2026-04-11) diff --git a/tools/src/gcu/browser/bridge.py b/tools/src/gcu/browser/bridge.py index fcd15552..3f12159b 100644 --- a/tools/src/gcu/browser/bridge.py +++ b/tools/src/gcu/browser/bridge.py @@ -80,33 +80,57 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None: _interaction_highlights: dict[int, dict] = {} -# Compact descriptor of document.activeElement. Returned by both click() +# Compact descriptor of the focused element. 
Returned by both click() # and click_coordinate() so the agent can verify it focused what it -# intended, then decide whether to follow up with browser_type_focused(text=...). -# Keeping this as a single shared string avoids drift -# between the two click paths. +# intended. When the outer document's activeElement is an