Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e4330abc80 | |||
| 558813e7fa | |||
| aba0ff07ba | |||
| 4303a36df0 | |||
| e68d8ef10b | |||
| c6b6a5a2f7 | |||
| 18f5f078fc | |||
| cc6ec97a75 |
+12
-1
@@ -44,7 +44,18 @@
|
||||
"WebFetch(domain:docs.litellm.ai)",
|
||||
"Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
|
||||
"Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
|
||||
"Bash(grep -v ':0$')"
|
||||
"Bash(grep -v ':0$')",
|
||||
"Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')",
|
||||
"mcp__gcu-tools__browser_status",
|
||||
"mcp__gcu-tools__browser_start",
|
||||
"mcp__gcu-tools__browser_navigate",
|
||||
"mcp__gcu-tools__browser_evaluate",
|
||||
"mcp__gcu-tools__browser_screenshot",
|
||||
"mcp__gcu-tools__browser_open",
|
||||
"mcp__gcu-tools__browser_click_coordinate",
|
||||
"mcp__gcu-tools__browser_get_rect",
|
||||
"mcp__gcu-tools__browser_type_focused",
|
||||
"mcp__gcu-tools__browser_wait"
|
||||
],
|
||||
"additionalDirectories": [
|
||||
"/home/timothy/.hive/skills/writing-hive-skills",
|
||||
|
||||
@@ -64,7 +64,7 @@ snapshot = await browser_snapshot(tab_id)
|
||||
|---------|--------------|-------|
|
||||
| Scroll doesn't move | Nested scroll container | Look for `overflow: scroll` divs |
|
||||
| Click no effect | Element covered | Check `getBoundingClientRect` vs viewport |
|
||||
| Type clears | Autocomplete/React | Check for event listeners on input |
|
||||
| Type clears | Autocomplete/React | Check for event listeners on input; try `browser_type_focused` |
|
||||
| Snapshot hangs | Huge DOM | Check node count in snapshot |
|
||||
| Snapshot stale | SPA hydration | Wait after navigation |
|
||||
|
||||
@@ -229,7 +229,7 @@ function queryShadow(selector) {
|
||||
|-------|-------------|----------|
|
||||
| Scroll not working | Find scrollable container | Mouse wheel at container center |
|
||||
| Click no effect | JavaScript click() | CDP mouse events |
|
||||
| Type clears | Add delay_ms | Use execCommand |
|
||||
| Type clears | Add delay_ms | Use `browser_type_focused` (Input.insertText) |
|
||||
| Snapshot hangs | Add timeout_s | DOM snapshot fallback |
|
||||
| Stale content | Wait for selector | Increase wait_until timeout |
|
||||
| Shadow DOM | Pierce selector | JavaScript traversal |
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
{
|
||||
"mcpServers": {}
|
||||
"mcpServers": {
|
||||
"gcu-tools": {
|
||||
"type": "stdio",
|
||||
"command": "uv",
|
||||
"args": ["run", "python", "-m", "gcu.server", "--stdio"],
|
||||
"cwd": "/home/timothy/aden/hive/tools"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,14 +18,13 @@ Use browser nodes (with `tools: {policy: "all"}`) when:
|
||||
|
||||
All tools are prefixed with `browser_`:
|
||||
- `browser_start`, `browser_open`, `browser_navigate` — launch/navigate
|
||||
- `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type` — interact
|
||||
- `browser_click`, `browser_click_coordinate`, `browser_fill`, `browser_type`, `browser_type_focused` — interact
|
||||
- `browser_press` (with optional `modifiers=["ctrl"]` etc.) — keyboard shortcuts
|
||||
- `browser_snapshot` — compact accessibility-tree read (structured)
|
||||
<!-- vision-only -->
|
||||
- `browser_screenshot` — visual capture (annotated PNG)
|
||||
<!-- /vision-only -->
|
||||
- `browser_shadow_query`, `browser_get_rect` — locate elements (shadow-piercing via `>>>`)
|
||||
- `browser_coords` — convert image pixels to CSS pixels (always use `css_x/y`, never `physical_x/y`)
|
||||
- `browser_scroll`, `browser_wait` — navigation helpers
|
||||
- `browser_evaluate` — run JavaScript
|
||||
- `browser_close`, `browser_close_finished` — tab cleanup
|
||||
@@ -38,9 +37,9 @@ All tools are prefixed with `browser_`:
|
||||
|
||||
Neither tool is "preferred" universally — they're for different jobs. Default to snapshot on text-heavy static pages, screenshot on SPAs and anything shadow-DOM-heavy. Activate the `browser-automation` skill for the full decision tree.
|
||||
|
||||
## Coordinate rule: always CSS pixels
|
||||
## Coordinate rule
|
||||
|
||||
Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS pixels**, not physical pixels. After a screenshot, use `browser_coords(image_x, image_y)` and feed the returned `css_x/y` (NOT `physical_x/y`) to `browser_click_coordinate`, `browser_hover_coordinate`, `browser_press_at`. Feeding physical pixels on a HiDPI display (DPR=1.6, 2, or 3) overshoots by `DPR×` and clicks land in the wrong place. `getBoundingClientRect()` already returns CSS pixels — pass through unchanged, no DPR multiplication.
|
||||
Every browser tool that takes or returns coordinates operates in **fractions of the viewport (0..1 for both axes)**. Read a target's proportional position off `browser_screenshot` ("~35% from the left, ~20% from the top" → `(0.35, 0.20)`) and pass that to `browser_click_coordinate` / `browser_hover_coordinate` / `browser_press_at`. `browser_get_rect` and `browser_shadow_query` return `rect.cx` / `rect.cy` as fractions. The tools multiply by `cssWidth` / `cssHeight` internally — no scale awareness required. Fractions are used because every vision model (Claude, GPT-4o, Gemini, local VLMs) resizes/tiles images differently; proportions are invariant. Avoid raw `getBoundingClientRect()` via `browser_evaluate` for coord lookup; use `browser_get_rect` instead.
|
||||
|
||||
## System prompt tips for browser nodes
|
||||
|
||||
@@ -50,7 +49,8 @@ Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS pixels**, not ph
|
||||
2. For static pages (docs, forms, search results), browser_snapshot is fine.
|
||||
3. Before typing into a rich-text editor (X compose, LinkedIn DM, Gmail, Reddit),
|
||||
click the input area first with browser_click_coordinate so React / Draft.js /
|
||||
Lexical register a native focus event. Otherwise the send button stays disabled.
|
||||
Lexical register a native focus event, then use browser_type_focused(text=...)
|
||||
for shadow-DOM inputs or browser_type(selector, text) for light-DOM inputs.
|
||||
4. Use browser_wait(seconds=2-3) after navigation for SPA hydration.
|
||||
5. If you hit an auth wall, call set_output with an error and move on.
|
||||
6. Keep tool calls per turn <= 10 for reliability.
|
||||
|
||||
@@ -42,25 +42,26 @@ after an interaction unless you need a fresh view.
|
||||
Only fall back to `browser_get_text` for extracting small elements by
|
||||
CSS selector.
|
||||
|
||||
## Coordinates: always CSS pixels
|
||||
## Coordinates
|
||||
|
||||
Chrome DevTools Protocol `Input.dispatchMouseEvent` takes **CSS
|
||||
pixels**, not physical pixels. This is critical and is often gotten wrong:
|
||||
Every browser tool that takes or returns coordinates operates in
|
||||
**fractions of the viewport (0..1 for both axes)**. Read a target's
|
||||
proportional position off `browser_screenshot` — "this button is
|
||||
~35% from the left, ~20% from the top" → pass `(0.35, 0.20)`.
|
||||
`browser_get_rect` and `browser_shadow_query` return `rect.cx` /
|
||||
`rect.cy` as fractions in the same space. The tools handle the
|
||||
fraction → CSS-px multiplication internally; you do not need to
|
||||
track image pixels, DPR, or any scale factor.
|
||||
|
||||
| Tool | Unit |
|
||||
|---|---|
|
||||
| `browser_click_coordinate(x, y)` | **CSS pixels** |
|
||||
| `browser_hover_coordinate(x, y)` | **CSS pixels** |
|
||||
| `browser_press_at(x, y, key)` | **CSS pixels** |
|
||||
| `getBoundingClientRect()` | already CSS pixels — pass straight through |
|
||||
| `browser_coords(img_x, img_y)` | returns `css_x/y` (use this) and `physical_x/y` (debug only) |
|
||||
Why fractions: every vision model (Claude, GPT-4o, Gemini, local
|
||||
VLMs) resizes or tiles images differently before the model sees the
|
||||
pixels. Proportions survive every such transform; pixel coordinates
|
||||
only "work" per-model and break when you swap backends.
|
||||
|
||||
**Always use `css_x/y`** from `browser_coords`. Feeding `physical_x/y`
|
||||
on a HiDPI display overshoots by `DPR×` — clicks land DPR times too
|
||||
far right and down. On a DPR=1.6 display that's 60% off.
|
||||
|
||||
Never multiply `getBoundingClientRect()` by `devicePixelRatio` — it's
|
||||
already in the right unit.
|
||||
Avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord
|
||||
lookup — that returns CSS px and will be wrong when fed to click
|
||||
tools. Prefer `browser_get_rect` / `browser_shadow_query`, which
|
||||
return fractions.
|
||||
|
||||
## Rich-text editors (X, LinkedIn DMs, Gmail, Reddit, Slack, Discord)
|
||||
|
||||
@@ -70,10 +71,12 @@ ProseMirror only register input as "real" after a native pointer-
|
||||
sourced focus event; JS `.focus()` is not enough. Without a real click
|
||||
first, the editor stays empty and the send button stays disabled.
|
||||
|
||||
`browser_type` now does this automatically — it clicks the element,
|
||||
then inserts text via CDP `Input.insertText` (IME-commit style), which
|
||||
rich editors accept cleanly. Before clicking send, verify the submit
|
||||
button's `disabled` / `aria-disabled` state via `browser_evaluate`.
|
||||
`browser_type` does this automatically when you have a selector — it
|
||||
clicks the element, then inserts text via CDP `Input.insertText`.
|
||||
For shadow-DOM inputs where selectors can't reach, use
|
||||
`browser_click_coordinate` to focus, then `browser_type_focused(text=...)`
|
||||
to type into the active element. Before clicking send, verify the
|
||||
submit button's `disabled` / `aria-disabled` state via `browser_evaluate`.
|
||||
|
||||
## Shadow DOM
|
||||
|
||||
@@ -86,11 +89,10 @@ reach shadow elements transparently.
|
||||
|
||||
**Shadow-heavy site workflow:**
|
||||
1. `browser_screenshot()` → visual image
|
||||
2. Identify target visually → image coordinate
|
||||
3. `browser_coords(x, y)` → CSS px
|
||||
4. `browser_click_coordinate(css_x, css_y)` → lands via native hit
|
||||
test; inputs get focused regardless of shadow depth
|
||||
5. Type via `browser_type_focused` (no selector needed — types into the
|
||||
2. Identify target visually → fractional `(x, y)` in `0..1`, estimated straight off the image
|
||||
3. `browser_click_coordinate(x, y)` → lands via native hit test;
|
||||
inputs get focused regardless of shadow depth
|
||||
4. Type via `browser_type_focused` (no selector needed — types into the
|
||||
already-focused element), or `browser_type` if you have a selector
|
||||
|
||||
For selector-style access when you know the shadow path:
|
||||
|
||||
@@ -743,6 +743,18 @@ async def create_queen(
|
||||
|
||||
async def _queen_loop():
|
||||
logger.debug("[_queen_loop] Starting queen loop for session %s", session.id)
|
||||
# Scope the browser profile to this session so parallel queens each
|
||||
# drive their own Chrome tab group instead of fighting over "default".
|
||||
# Browser tools run in a stdio MCP subprocess, so we can't set a
|
||||
# contextvar across processes — instead we inject `profile` as a
|
||||
# CONTEXT_PARAM that ToolRegistry passes into every MCP call. The
|
||||
# token stays local to this task.
|
||||
try:
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
|
||||
ToolRegistry.set_execution_context(profile=session.id)
|
||||
except Exception:
|
||||
logger.debug("Queen: failed to set browser profile for session %s", session.id, exc_info=True)
|
||||
try:
|
||||
lc = _queen_loop_config
|
||||
queen_loop_config = LoopConfig(
|
||||
|
||||
@@ -25,17 +25,6 @@ from framework.config import QUEENS_DIR
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _stop_live_sessions(manager, keep_session_id: str | None = None) -> None:
|
||||
"""Stop live sessions so only the selected queen session remains active."""
|
||||
for session in list(manager.list_sessions()):
|
||||
if keep_session_id and session.id == keep_session_id:
|
||||
continue
|
||||
try:
|
||||
await manager.stop_session(session.id)
|
||||
except Exception:
|
||||
logger.debug("Failed to stop session %s during queen switch", session.id)
|
||||
|
||||
|
||||
def _read_queen_session_meta(queen_id: str, session_id: str) -> dict[str, Any]:
|
||||
"""Return persisted metadata for a queen session when available."""
|
||||
session_dir = QUEENS_DIR / queen_id / "sessions" / session_id
|
||||
@@ -229,10 +218,6 @@ async def handle_queen_session(request: web.Request) -> web.Response:
|
||||
}
|
||||
)
|
||||
|
||||
# Stop any live sessions bound to a different queen so only one queen
|
||||
# is active at a time.
|
||||
await _stop_live_sessions(manager)
|
||||
|
||||
# 2. Find the most recent cold session for this queen and resume it.
|
||||
# IMPORTANT: skip sessions that don't belong in the queen DM:
|
||||
# - ``colony_fork: true`` -- duplicates created by handle_colony_spawn
|
||||
@@ -323,7 +308,6 @@ async def handle_select_queen_session(request: web.Request) -> web.Response:
|
||||
|
||||
live_session = manager.get_session(target_session_id)
|
||||
if live_session is not None:
|
||||
await _stop_live_sessions(manager, keep_session_id=target_session_id)
|
||||
return web.json_response(
|
||||
{
|
||||
"session_id": live_session.id,
|
||||
@@ -332,8 +316,6 @@ async def handle_select_queen_session(request: web.Request) -> web.Response:
|
||||
}
|
||||
)
|
||||
|
||||
await _stop_live_sessions(manager)
|
||||
|
||||
meta = _read_queen_session_meta(queen_id, target_session_id)
|
||||
agent_path = meta.get("agent_path")
|
||||
initial_phase = None if agent_path else "independent"
|
||||
@@ -367,7 +349,6 @@ async def handle_new_queen_session(request: web.Request) -> web.Response:
|
||||
initial_prompt = body.get("initial_prompt")
|
||||
initial_phase = body.get("initial_phase") or "independent"
|
||||
|
||||
await _stop_live_sessions(manager)
|
||||
session = await manager.create_session(
|
||||
initial_prompt=initial_prompt,
|
||||
queen_name=queen_id,
|
||||
|
||||
@@ -671,8 +671,21 @@ class SessionManager:
|
||||
event_bus=session.event_bus,
|
||||
)
|
||||
|
||||
# Start the worker's agent loop in the background
|
||||
session.queen_task = asyncio.create_task(session.queen_executor.run(initial_message=initial_prompt))
|
||||
# Start the worker's agent loop in the background.
|
||||
# Scope browser profile per-session so parallel sessions drive
|
||||
# independent Chrome tab groups. Browser tools live in an MCP
|
||||
# subprocess; we inject `profile` via the ToolRegistry execution
|
||||
# context (a CONTEXT_PARAM) so it flows into every tool call.
|
||||
async def _run_worker():
|
||||
try:
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
|
||||
ToolRegistry.set_execution_context(profile=session.id)
|
||||
except Exception:
|
||||
logger.debug("Worker: failed to set browser profile", exc_info=True)
|
||||
await session.queen_executor.run(initial_message=initial_prompt)
|
||||
|
||||
session.queen_task = asyncio.create_task(_run_worker())
|
||||
|
||||
# Set up event persistence
|
||||
if session.event_bus and queen_dir:
|
||||
|
||||
@@ -638,13 +638,17 @@ class TestQueenSessionSelection:
|
||||
)
|
||||
assert resp.status == 200
|
||||
data = await resp.json()
|
||||
|
||||
assert data == {
|
||||
"session_id": "queen_live",
|
||||
"queen_id": "queen_technology",
|
||||
"status": "live",
|
||||
}
|
||||
assert any(call.args == ("other_live",) for call in manager.stop_session.await_args_list)
|
||||
# Assert inside the async-with so app shutdown (which stops
|
||||
# remaining sessions as cleanup) doesn't pollute the assertions.
|
||||
assert data == {
|
||||
"session_id": "queen_live",
|
||||
"queen_id": "queen_technology",
|
||||
"status": "live",
|
||||
}
|
||||
# Other queen's live session must be left running so multiple
|
||||
# queens can stay active in parallel across navigation.
|
||||
manager.stop_session.assert_not_awaited()
|
||||
assert "other_live" in manager._sessions
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_select_queen_session_restores_specific_history_session(self, monkeypatch, tmp_path):
|
||||
@@ -745,18 +749,21 @@ class TestQueenSessionSelection:
|
||||
)
|
||||
assert resp.status == 200
|
||||
data = await resp.json()
|
||||
|
||||
assert data == {
|
||||
"session_id": "fresh_thread",
|
||||
"queen_id": "queen_technology",
|
||||
"status": "created",
|
||||
}
|
||||
manager.stop_session.assert_awaited_once_with("old_live")
|
||||
manager.create_session.assert_awaited_once_with(
|
||||
initial_prompt=None,
|
||||
queen_name="queen_technology",
|
||||
initial_phase="independent",
|
||||
)
|
||||
# Assert inside the async-with so app shutdown (which stops
|
||||
# remaining sessions as cleanup) doesn't pollute the assertions.
|
||||
assert data == {
|
||||
"session_id": "fresh_thread",
|
||||
"queen_id": "queen_technology",
|
||||
"status": "created",
|
||||
}
|
||||
# Other queen's live session must be left running.
|
||||
manager.stop_session.assert_not_awaited()
|
||||
assert "old_live" in manager._sessions
|
||||
manager.create_session.assert_awaited_once_with(
|
||||
initial_prompt=None,
|
||||
queen_name="queen_technology",
|
||||
initial_phase="independent",
|
||||
)
|
||||
|
||||
|
||||
class TestExecution:
|
||||
|
||||
@@ -12,25 +12,22 @@ metadata:
|
||||
|
||||
All GCU browser tools drive a real Chrome instance through the Beeline extension and Chrome DevTools Protocol (CDP). That means clicks, keystrokes, and screenshots are processed by the actual browser's native hit testing, focus, and layout engines — **not** a synthetic event layer. Understanding this unlocks strategies that make hard sites easy.
|
||||
|
||||
## Coordinates: always CSS pixels
|
||||
## Coordinates
|
||||
|
||||
**Chrome DevTools Protocol `Input.dispatchMouseEvent` operates in CSS pixels, not physical pixels.**
|
||||
|
||||
When you call `browser_coords(image_x, image_y)` after a screenshot, the returned dict has both `css_x/y` and `physical_x/y`. **Always use `css_x/y` for clicks, hovers, and key presses.**
|
||||
Every browser tool that takes or returns coordinates operates in **fractions of the viewport (0..1 for both axes)**. Read a target's proportional position off `browser_screenshot` — "this button is about 35% from the left and 20% from the top" → pass `(0.35, 0.20)`. Rect-returning tools (`browser_get_rect`, `browser_shadow_query`, and the `rect` inside `focused_element`) also return fractions. The tools convert to CSS pixels internally before dispatching to Chrome.
|
||||
|
||||
```
|
||||
browser_screenshot() → image (downscaled to 800/900 px wide)
|
||||
browser_coords(img_x, img_y) → {css_x, css_y, physical_x, physical_y}
|
||||
browser_click_coordinate(css_x, css_y) ← USE css_x/y
|
||||
browser_hover_coordinate(css_x, css_y) ← USE css_x/y
|
||||
browser_press_at(css_x, css_y, key) ← USE css_x/y
|
||||
browser_screenshot() → image + cssWidth/cssHeight in meta
|
||||
browser_click_coordinate(x, y) → x, y are fractions 0..1
|
||||
browser_hover_coordinate(x, y) → fractions
|
||||
browser_press_at(x, y, key) → fractions
|
||||
browser_get_rect(selector) → rect → rect.cx / rect.cy are fractions
|
||||
browser_shadow_query(...) → rect → same
|
||||
```
|
||||
|
||||
Feeding `physical_x/y` on a HiDPI display overshoots by DPR× — on a DPR=1.6 laptop, clicks land 60% too far right and down. The ratio between `physicalScale` and `cssScale` tells you the effective DPR.
|
||||
**Why fractions:** every vision model (Claude ~1.15 MP target, GPT-4o 512-px tiles, Gemini, local VLMs) resizes or tiles images differently before the model sees the pixels. Proportions survive every such transform; pixel coordinates only "work" per-model and silently break when you swap backends. Four-decimal precision (`0.0001` ≈ 0.17 CSS px on a 1717-wide viewport) is more than enough for the tightest targets.
|
||||
|
||||
`getBoundingClientRect()` already returns CSS pixels — feed those values straight through to click/hover tools without any DPR multiplication.
|
||||
|
||||
**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Use `browser_shadow_query` which handles the math, or fall back to visually picking coordinates from a screenshot.
|
||||
**Exception for zoomed elements:** pages that use `zoom` or `transform: scale()` on a container (LinkedIn's `#interop-outlet`, some embedded iframes) render in a scaled local coordinate space. `getBoundingClientRect` there may not match CDP's hit space. Prefer `browser_shadow_query` (which handles the math and returns fractions) or visually pick coordinates from a screenshot. Avoid raw `browser_evaluate` + `getBoundingClientRect()` for coord lookup — that returns CSS px and will be wrong when fed to click tools.
|
||||
|
||||
## Screenshot + coordinates is shadow-agnostic — prefer it on shadow-heavy sites
|
||||
|
||||
@@ -38,37 +35,28 @@ On sites that use Shadow DOM heavily (Reddit's faceplate Web Components, LinkedI
|
||||
|
||||
Why:
|
||||
|
||||
- **CDP hit testing walks shadow roots natively.** `browser_click_coordinate(css_x, css_y)` routes through Chrome's native hit tester, which traverses open shadow roots automatically. You don't need to know the shadow structure.
|
||||
- **CDP hit testing walks shadow roots natively.** `browser_click_coordinate(x, y)` routes through Chrome's native hit tester, which traverses open shadow roots automatically. You don't need to know the shadow structure.
|
||||
- **Keyboard dispatch follows focus** into shadow roots. After a click focuses an input (even one three shadow levels deep), `browser_press(...)` with no selector dispatches keys to `document.activeElement`'s computed focus target.
|
||||
- **Screenshots render the real layout** regardless of DOM implementation.
|
||||
|
||||
Whereas `wait_for_selector`, `browser_click(selector=...)`, `browser_type(selector=...)` all use `document.querySelector` under the hood, which **stops at shadow boundaries**. They cannot see elements inside shadow roots.
|
||||
Whereas `wait_for_selector`, `browser_click(selector=...)`, `browser_type(selector=...)` all use `document.querySelector` under the hood, which **stops at shadow boundaries**. They cannot see elements inside shadow roots. For shadow-DOM inputs, use `browser_type_focused` after focusing via click-coordinate.
|
||||
|
||||
### Recommended workflow on shadow-heavy sites
|
||||
|
||||
1. `browser_screenshot()` → visual image
|
||||
2. Identify the target visually → image pixel `(x, y)` (eyeball from the screenshot)
|
||||
3. `browser_coords(x, y)` → convert to CSS px
|
||||
4. `browser_click_coordinate(css_x, css_y)` → lands on the element via native hit testing; inputs get focused. **The response now includes `focused_element: {tag, id, role, contenteditable, rect, ...}`** — use it to verify you actually focused what you intended.
|
||||
5. `browser_type(text="...")` with **NO selector** → dispatches CDP `Input.insertText` to `document.activeElement`. Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Only pass a selector if you want a DIFFERENT element than the one you just focused (rare).
|
||||
6. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`).
|
||||
1. `browser_screenshot()` → JPEG; meta includes `cssWidth`/`cssHeight` for reference.
|
||||
2. Identify the target visually → estimate its proportional position `(fx, fy)` where each is in `0..1`.
|
||||
3. `browser_click_coordinate(fx, fy)` → tool converts to CSS px and dispatches; CDP native hit testing focuses the element. **The response includes `focused_element: {tag, id, role, contenteditable, rect, inFrame?, ...}`** — use it to verify you actually focused what you intended. `rect` is in fractions (same space as your input). When focus is inside a same-origin iframe, the descriptor reports the inner element and adds `inFrame: [...]` breadcrumbs.
|
||||
4. `browser_type_focused(text="...")` → inserts text into `document.activeElement` (traverses into same-origin iframes automatically). Shadow roots, iframes, Lexical, Draft.js, ProseMirror all just work. Use `browser_type(selector, text)` instead when you have a reliable CSS selector for a light-DOM element.
|
||||
5. Verify via `browser_screenshot` OR `browser_get_attribute` on a known-reachable marker (e.g. check that the Send button's `aria-disabled` flipped to `false`).
|
||||
|
||||
### The click→type loop (canonical pattern)
|
||||
|
||||
```
|
||||
resp = browser_click_coordinate(x, y)
|
||||
fe = resp.get("focused_element")
|
||||
if fe and (fe.get("contenteditable") or fe["tag"] in ("textarea", "input")):
|
||||
browser_type(text="...") # no selector — insertText to activeElement
|
||||
else:
|
||||
# you clicked something that isn't editable — refine coords and retry
|
||||
# do NOT reach for browser_evaluate + execCommand('insertText', ...)
|
||||
# or a walk(root) shadow traversal. The problem is your click, not
|
||||
# the typing method.
|
||||
...
|
||||
```
|
||||
1. Call `browser_click_coordinate(x, y)` to click the target element.
|
||||
2. Check the `focused_element` field in the response — it tells you what actually received focus (tag, id, role, contenteditable, rect).
|
||||
3. If the focused element is editable, call `browser_type_focused(text="...")` to insert text. Then use `browser_screenshot` or `browser_get_attribute` to verify the text took effect.
|
||||
4. If it is NOT editable, your click landed on the wrong thing — refine coordinates and retry. Do NOT reach for `browser_evaluate` + `execCommand('insertText')` or shadow-root traversals. The problem is the click target, not the typing method.
|
||||
|
||||
`browser_click` (selector-based) also returns `focused_element` now, so the same check works whether you clicked by selector or coordinate.
|
||||
`browser_click` (selector-based) also returns `focused_element`, so the same check works whether you clicked by selector or coordinate.
|
||||
|
||||
### Empirically verified (2026-04-11)
|
||||
|
||||
@@ -79,13 +67,6 @@ document > reddit-search-large [shadow]
|
||||
> input[name="q"]
|
||||
```
|
||||
|
||||
- `document.querySelector('input')` → **0 visible inputs** on the page (all in shadow)
|
||||
- `browser_type('faceplate-search-input input', 'python')` → "Element not found"
|
||||
- `browser_click_coordinate(617, 28)` → focus trail: `REDDIT-SEARCH-LARGE > FACEPLATE-SEARCH-INPUT > INPUT` ✓
|
||||
- Char-by-char key dispatch after the click → `input.value === 'python'` ✓
|
||||
|
||||
Coordinate pipeline: works perfectly. Selector pipeline: unusable without shadow-piercing syntax.
|
||||
|
||||
### Shadow-piercing selectors
|
||||
|
||||
When you DO want a selector-based approach and know the shadow structure, `browser_shadow_query` and `browser_get_rect` support `>>>` shadow-piercing syntax:
|
||||
@@ -95,7 +76,7 @@ browser_shadow_query("reddit-search-large >>> #search-input")
|
||||
browser_get_rect("#interop-outlet >>> #ember37 >>> p")
|
||||
```
|
||||
|
||||
Returns the element's rect in **CSS pixels** (feed directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do.
|
||||
Returns the element's rect as **fractions of the viewport** (feed `rect.cx` / `rect.cy` directly to click tools). Remember: `browser_type` and `wait_for_selector` do **not** support `>>>` — only shadow_query and get_rect do.
|
||||
|
||||
## Navigation and waiting
|
||||
|
||||
@@ -103,8 +84,8 @@ Returns the element's rect in **CSS pixels** (feed directly to click tools). Rem
|
||||
|
||||
```
|
||||
browser_navigate(url, wait_until="load") # "load" | "domcontentloaded" | "networkidle"
|
||||
browser_wait_for_selector("h1", timeout_ms=5000)
|
||||
browser_wait_for_text("Some text", timeout_ms=5000)
|
||||
browser_wait_for_selector("h1", timeout_ms=2000)
|
||||
browser_wait_for_text("Some text", timeout_ms=2000)
|
||||
browser_go_back()
|
||||
browser_go_forward()
|
||||
browser_reload()
|
||||
@@ -122,7 +103,7 @@ All return real URLs and titles. On a fast page `navigate(wait_until="load")` re
|
||||
| x.com/twitter | 1.2–1.6 s |
|
||||
| linkedin.com (logged in) | 4–5 s |
|
||||
|
||||
Use `timeout_ms=20000` for LinkedIn and other heavy SPAs to give them margin.
|
||||
For LinkedIn and other heavy SPAs, rely on an explicit wait after navigation (`browser_wait`, or `sleep()` in scripts) to let the page hydrate.
|
||||
|
||||
### After navigate, always let SPA hydrate
|
||||
|
||||
@@ -131,7 +112,7 @@ Even after `wait_until="load"`, React/Vue SPAs often render their real chrome in
|
||||
### Reading pages efficiently
|
||||
|
||||
- **Prefer `browser_snapshot` over `browser_get_text("body")`** — returns a compact ~1–5 KB accessibility tree vs 100+ KB of raw HTML.
|
||||
- Interaction tools (`browser_click`, `browser_type`, `browser_fill`, `browser_scroll`, etc.) return a page snapshot automatically in their result. Use it to decide your next action — do NOT call `browser_snapshot` separately after every action. Only call `browser_snapshot` when you need a fresh view without performing an action, or after setting `auto_snapshot=false`.
|
||||
- Interaction tools (`browser_click`, `browser_type`, `browser_type_focused`, `browser_fill`, `browser_scroll`, etc.) return a page snapshot automatically in their result. Use it to decide your next action — do NOT call `browser_snapshot` separately after every action. Only call `browser_snapshot` when you need a fresh view without performing an action, or after setting `auto_snapshot=false`.
|
||||
- Complex pages (LinkedIn, Twitter/X, SPAs with virtual scrolling) have DOMs that don't match what's visually rendered — snapshot refs may be stale, missing, or misaligned with visible layout. On these pages, `browser_screenshot` is the only reliable way to orient yourself.
|
||||
- Only fall back to `browser_get_text` for extracting specific small elements by CSS selector.
|
||||
|
||||
@@ -151,44 +132,13 @@ The symptom is always the same: **you type, the characters appear visually, and
|
||||
|
||||
### Safe "click-then-type-then-verify" pattern
|
||||
|
||||
```
|
||||
# 1. Focus the real element via a real click (not JS .focus()).
|
||||
rect = browser_get_rect(selector) # or browser_shadow_query for shadow sites
|
||||
browser_click_coordinate(rect.cx, rect.cy)
|
||||
sleep(0.5) # let the editor open / focus settle
|
||||
1. **Focus** the real element via a real click (not JS `.focus()`). Use `browser_get_rect(selector)` (or `browser_shadow_query` for shadow sites) to get coordinates, then `browser_click_coordinate(cx, cy)`. Wait ~0.5 s for the editor to open and focus to settle.
|
||||
|
||||
# 2. Type. browser_type now uses CDP Input.insertText by default, which is
|
||||
# the most reliable way to insert text into rich editors (Lexical,
|
||||
# Draft.js, ProseMirror, any React-controlled contenteditable).
|
||||
browser_type(selector, text)
|
||||
sleep(1.0) # let framework state commit
|
||||
2. **Type** the text. Use `browser_type(selector, text)` for light-DOM inputs, or `browser_type_focused(text=...)` for shadow-DOM / already-focused inputs. Both use CDP `Input.insertText` by default, which is the most reliable method for rich editors (Lexical, Draft.js, ProseMirror). Wait ~500 ms for framework state to commit.
|
||||
|
||||
# 3. BEFORE clicking send, verify the submit button is actually enabled.
|
||||
# Don't trust that typing worked — check state.
|
||||
state = browser_evaluate("""
|
||||
(function(){
|
||||
const btn = document.querySelector('[data-testid="tweetButton"]');
|
||||
if (!btn) return {exists: false};
|
||||
return {
|
||||
exists: true,
|
||||
disabled: btn.disabled || btn.getAttribute('aria-disabled') === 'true',
|
||||
text: btn.textContent.trim(),
|
||||
};
|
||||
})()
|
||||
""")
|
||||
3. **Verify** the submit button is enabled before clicking it. Use `browser_evaluate` to check the button's `disabled` or `aria-disabled` attribute. Do NOT trust that typing worked — always check state.
|
||||
|
||||
# 4. Only click send if the button is enabled.
|
||||
if not state['disabled']:
|
||||
browser_click(submit_selector)
|
||||
else:
|
||||
# Recovery: sometimes a click-again + one extra keystroke nudges
|
||||
# React into recomputing hasRealContent.
|
||||
browser_click_coordinate(rect.cx, rect.cy)
|
||||
browser_press("End")
|
||||
browser_press(" ")
|
||||
browser_press("Backspace")
|
||||
# re-check state
|
||||
```
|
||||
4. **Only click send if the button is enabled.** If the button is still disabled, try the recovery dance: click the textarea again, press `End`, press a space, press `Backspace` — this forces React to recompute `hasRealContent`. Then re-check the button state.
|
||||
|
||||
### Why `browser_type` uses `Input.insertText` by default
|
||||
|
||||
@@ -224,7 +174,7 @@ Always include an equivalent cleanup block in any script that types into a compo
|
||||
| Site | Editor | Workaround |
|
||||
|---|---|---|
|
||||
| **X / Twitter** compose | Draft.js | Click `[data-testid='tweetTextarea_0']` first, then type with `delay_ms=20`. First 1-2 chars may be eaten — accept truncation or prepend a throwaway char. Verify `[data-testid='tweetButton']` has `disabled: false` before clicking. |
|
||||
| **LinkedIn** messaging | contenteditable (inside `#interop-outlet` shadow root) | Use `browser_shadow_query` to find the rect, click-coordinate to focus, then type via focus-based key dispatch (selector-based type can't reach shadow). Send button is `.msg-form__send-button`. |
|
||||
| **LinkedIn** messaging | contenteditable (inside `#interop-outlet` shadow root) | Use `browser_shadow_query` to find the rect, click-coordinate to focus, then `browser_type_focused(text=...)` (selector-based `browser_type` can't reach shadow). Send button is `.msg-form__send-button`. |
|
||||
| **LinkedIn** feed post composer | Quill/LinkedIn custom | Click the "Start a post" trigger first, wait 1s for modal, click the textarea, type. |
|
||||
| **Reddit** comment/post box | ProseMirror | Click the textarea, wait 0.5s for the toolbar to mount, then type. Submit is `button[slot="submit-button"]` inside a shreddit-composer. |
|
||||
| **Gmail** compose | Lexical | Click the body first. Gmail has a visible `div[contenteditable=true][aria-label*='Message Body']` after opening a compose window. |
|
||||
@@ -244,7 +194,7 @@ browser_type(selector, text)
|
||||
- Fires real `keydown` / `keypress` / `input` / `keyup` events — frameworks that branch on `event.key` or `event.code` see the right values
|
||||
- Matches what Playwright and Puppeteer send
|
||||
|
||||
Works on real `<input>`, `<textarea>`, and `contenteditable` elements. For shadow-DOM inputs, see the "shadow-heavy sites" section above — `type_text(selector=)` can't see past shadow boundaries.
|
||||
Works on real `<input>`, `<textarea>`, and `contenteditable` elements. For shadow-DOM inputs, see the "shadow-heavy sites" section above — `browser_type(selector=)` can't see past shadow boundaries; use `browser_type_focused` after click-coordinate focus.
|
||||
|
||||
### Keyboard shortcuts (Ctrl+A, Shift+Tab, Cmd+Enter)
|
||||
|
||||
@@ -266,25 +216,15 @@ Recognized without modifiers: `Enter`, `Tab`, `Escape`, `Backspace`, `Delete`, `
|
||||
## Screenshots
|
||||
|
||||
```
|
||||
browser_screenshot() # viewport, 900 px wide by default
|
||||
browser_screenshot(full_page=True) # full scrollable page
|
||||
browser_screenshot() # viewport, 800 px wide JPEG
|
||||
browser_screenshot(full_page=True) # full scrollable page (overview only — don't click off a full-page shot)
|
||||
browser_screenshot(selector="#header") # clip to element's rect
|
||||
```
|
||||
|
||||
Returns a PNG with automatic downscaling to a target width (default 900 px) plus a JSON metadata block containing `cssWidth`, `devicePixelRatio`, `physicalScale`, `cssScale`, and a `scaleHint` string. The image is also annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab.
|
||||
Returns a JPEG (quality 75, ~50–120 KB) at 800 px wide. The pixel width is purely a bandwidth choice; all tool coordinates are fractions of the viewport and are invariant to image size. Metadata includes `imageWidth` (800), `cssWidth`, `cssHeight` (for reference), and `physicalScale`. The image is annotated with a highlight rectangle/dot showing the last interaction (click, hover, type) if one happened on this tab.
|
||||
|
||||
The highlight overlay stays visible on the page for **10 seconds** after each interaction, then fades. If you want the highlight to appear in a screenshot, make sure your click / hover / type happens <10 s before the screenshot.
|
||||
|
||||
### Anatomy of the scale fields
|
||||
|
||||
- `cssWidth` = `window.innerWidth` (CSS px)
|
||||
- `devicePixelRatio` = `window.devicePixelRatio` (often 1.6, 2, or 3 on modern displays)
|
||||
- `physicalScale = png_width / image_width` (how many physical-px per image-px)
|
||||
- `cssScale = cssWidth / image_width` (how many CSS-px per image-px)
|
||||
- Effective DPR = `physicalScale / cssScale` (should match `devicePixelRatio`)
|
||||
|
||||
When converting image coordinates for clicks, always use `cssScale`. The `physicalScale` field is there for debugging HiDPI displays, not for inputs.
|
||||
|
||||
## Scrolling
|
||||
|
||||
- Use large scroll amounts (~2000) when loading more content — sites like Twitter and LinkedIn have lazy loading for paging.
|
||||
@@ -340,7 +280,7 @@ Reddit's search input lives **two shadow levels deep** inside `reddit-search-lar
|
||||
|
||||
1. `browser_shadow_query("reddit-search-large >>> #search-input")` → rect
|
||||
2. `browser_click_coordinate(rect.cx, rect.cy)` → click lands on the real shadow input via native hit testing; input becomes focused
|
||||
3. `browser_press(c)` for each character → dispatches to focused element
|
||||
3. `browser_type_focused(text="query")` → dispatches to focused element via `Input.insertText`
|
||||
4. Verify by reading `.value` via `browser_evaluate` walking the shadow path
|
||||
|
||||
### X / Twitter
|
||||
@@ -409,11 +349,12 @@ Then pass the most specific selector that uniquely identifies the right input (e
|
||||
- **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above.
|
||||
- **Using per-character `keyDown` on Lexical / Draft.js editors → keys dispatch but text never appears.** Those editors intercept `beforeinput` and route insertion through their own state machine; raw keyDown events are silently dropped. `browser_type` now uses `Input.insertText` by default (the CDP IME-commit method) which these editors accept cleanly. Only set `use_insert_text=False` when you explicitly need per-keystroke dispatch.
|
||||
- **Leaving a composer with text then trying to navigate → `beforeunload` dialog hangs the bridge.** LinkedIn and several other sites pop a native "unsent message" confirm. `browser_navigate` and `close_tab` both time out against this. Always strip `window.onbeforeunload = null` via `browser_evaluate` before any navigation after typing in a composer, or wrap your logic in a `try/finally` that runs the cleanup block.
|
||||
- **Clicking at physical pixels.** CDP uses CSS px. `browser_coords` returns both for debugging, but always feed `css_x/y` to click tools.
|
||||
- **Click landed in the wrong region (sidebar / header instead of target).** Check `focused_element` in the click response — it's ground truth for what actually got focused, including the `inFrame` breadcrumb when focus ends up inside a same-origin iframe. If it isn't the target (e.g. `className: "msg-conversation-listitem__link"` when you meant to hit a composer), adjust the fraction and retry. Coordinates you pass are fractions of the viewport; the tool multiplies by `cssWidth` / `cssHeight` internally, so a wrong result means your estimated proportion was off — not that any scale went sideways.
|
||||
- **Accidentally passing pixels to click / hover / press_at.** The tools reject any coord outside `[-0.1, 1.5]` with a clear error. If you see that error, you passed a pixel (like 815) instead of a fraction (like 0.475). Use `browser_get_rect` to get exact fractional cx/cy, or read proportions off `browser_screenshot`.
|
||||
- **Calling `wait_for_selector` on a shadow element.** It'll always time out. Use `browser_shadow_query` or the screenshot + coordinate strategy.
|
||||
- **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`.
|
||||
- **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements.
|
||||
- **Using `browser_type(selector)` on LinkedIn DMs or any shadow-DOM input.** Won't find the element. Fall back to click-to-focus + `browser_press` per character.
|
||||
- **Using `browser_type(selector)` on LinkedIn DMs or any shadow-DOM input.** Won't find the element. Use `browser_click_coordinate` to focus, then `browser_type_focused(text=...)` to type.
|
||||
- **Clicking a "Photo" / "Attach" / "Upload" button to pick a file.** This opens Chrome's NATIVE OS file picker, which is rendered outside the web page and cannot be interacted with via CDP. Your automation will hang staring at an unreachable dialog. ALWAYS use `browser_upload(selector, file_paths)` against the underlying `<input type='file'>` element — see the "File uploads" section above for the full pattern. This is the single most common way to wedge a browser session on compose-with-media flows (X/LinkedIn/Gmail).
|
||||
- **Keyboard shortcuts without the `code` field.** Chrome's shortcut dispatcher ignores keyboard events that lack a `code` or `windowsVirtualKeyCode`. `browser_press(..., modifiers=[...])` populates these automatically; raw `Input.dispatchKeyEvent` calls from `browser_evaluate` may not.
|
||||
- **Taking a screenshot more than 10s after the last interaction** and expecting the highlight to still be visible. The overlay fades after 10s. Take the screenshot sooner, or re-trigger the interaction.
|
||||
@@ -476,9 +417,8 @@ sleep(2)
|
||||
# Shadow-pierce the nested search input
|
||||
sq = browser_shadow_query("reddit-search-large >>> #search-input")
|
||||
browser_click_coordinate(sq.rect.cx, sq.rect.cy)
|
||||
# Typing can't use selector (shadow); focused input receives raw key presses
|
||||
for c in "python":
|
||||
browser_press(c)
|
||||
# Typing can't use selector (shadow); use browser_type_focused on the focused input
|
||||
browser_type_focused(text="python")
|
||||
browser_screenshot()
|
||||
browser_press("Escape")
|
||||
```
|
||||
@@ -486,7 +426,7 @@ browser_press("Escape")
|
||||
### Search LinkedIn and dismiss without submitting
|
||||
|
||||
```
|
||||
browser_navigate("https://www.linkedin.com/feed/", wait_until="load", timeout_ms=20000)
|
||||
browser_navigate("https://www.linkedin.com/feed/", wait_until="load")
|
||||
sleep(3)
|
||||
browser_wait_for_selector("input[data-testid='typeahead-input']", timeout_ms=5000)
|
||||
rect = browser_get_rect("input[data-testid='typeahead-input']")
|
||||
|
||||
@@ -13,15 +13,15 @@ metadata:
|
||||
|
||||
LinkedIn is the hardest mainstream site to automate because it combines **shadow DOM** (`#interop-outlet` for messaging), **strict Trusted Types CSP** (silently drops `innerHTML`), **heavy React reconciliation** (injected nodes get stripped on re-render), **native `beforeunload` draft dialogs** (hang the bridge), and **aggressive spam filters**. Every one of those has bitten us at least once. This skill documents what actually works.
|
||||
|
||||
**Always activate `browser-automation` first.** This skill assumes you already know about CSS-px coordinates, `browser_type`'s click-first behavior, and `browser_shadow_query`. The guidance below is LinkedIn-specific; general browser rules are there.
|
||||
**Always activate `browser-automation` first.** This skill assumes you already know about CSS-px coordinates, `browser_type`/`browser_type_focused`, and `browser_shadow_query`. The guidance below is LinkedIn-specific; general browser rules are there.
|
||||
|
||||
## Timing expectations
|
||||
|
||||
- `browser_navigate(wait_until="load", timeout_ms=20000)` — LinkedIn takes **4–5 seconds** to load the feed cold. Default 30s timeout is fine; use 20s as a floor.
|
||||
- `browser_navigate(wait_until="load")` — LinkedIn takes **4–5 seconds** to load the feed cold.
|
||||
- After navigation, **always `sleep(3)`** to let React hydrate the profile/feed chrome before querying selectors. Without the sleep `wait_for_selector` will flake on elements that exist moments later.
|
||||
- Composer modal slide-in takes **~2 seconds** after you click the Message button.
|
||||
|
||||
## Verified selectors (2026-04-11)
|
||||
## Verified selectors
|
||||
|
||||
| Target | Selector | Notes |
|
||||
|---|---|---|
|
||||
@@ -34,14 +34,14 @@ LinkedIn is the hardest mainstream site to automate because it combines **shadow
|
||||
| Pending connection card | `.invitation-card, .invitations-card, [data-test-incoming-invitation-card]` | Filter out "invited you to follow" / "subscribe" cards |
|
||||
| Accept button | `button[aria-label*="Accept"]` within the card scope | Per-card scoping is critical — there are many Accept buttons on the page |
|
||||
|
||||
LinkedIn changes class names aggressively. If a class-based selector breaks, fall back to **`browser_screenshot` → visual identification → `browser_coords` → `browser_click_coordinate`**. The screenshot + coord path works regardless of class-name churn and regardless of shadow DOM.
|
||||
LinkedIn changes class names aggressively. If a class-based selector breaks, fall back to **`browser_screenshot` → visual identification → `browser_click_coordinate`** with the viewport fraction you estimate from the image (coordinates are fractions of the viewport, so the image's pixel size needs no conversion). The screenshot + coord path works regardless of class-name churn and regardless of shadow DOM.
|
||||
|
||||
## Profile Message flow (verified end-to-end 2026-04-11)
|
||||
|
||||
```
|
||||
# 1. Load the profile
|
||||
browser_navigate("https://www.linkedin.com/in/<username>/", wait_until="load", timeout_ms=20000)
|
||||
sleep(4)
|
||||
browser_navigate("https://www.linkedin.com/in/<username>/", wait_until="load")
|
||||
sleep(3)
|
||||
|
||||
# 2. Strip onbeforeunload before any state-mutating work — prevents draft-dialog deadlock later
|
||||
browser_evaluate("""
|
||||
@@ -98,17 +98,18 @@ textarea = browser_evaluate("""
|
||||
browser_click_coordinate(textarea['cx'], textarea['cy'])
|
||||
sleep(0.6)
|
||||
|
||||
# 6. Insert text via browser_type WITHOUT a selector. This dispatches
|
||||
# CDP Input.insertText to document.activeElement — the same underlying
|
||||
# 6. Insert text via browser_type_focused. This dispatches CDP
|
||||
# Input.insertText to document.activeElement — the same underlying
|
||||
# mechanism as execCommand('insertText') but with no JSON escaping,
|
||||
# no browser_evaluate round trip, and built-in retry. The click in
|
||||
# step 5 already focused Lexical, so insertText lands in the editor
|
||||
# regardless of the shadow wrapping around #interop-outlet.
|
||||
#
|
||||
# Do NOT pass a selector here. Selector-based browser_type cannot see
|
||||
# past the #interop-outlet shadow root. No-selector mode sidesteps
|
||||
# that entirely by routing to activeElement.
|
||||
browser_type(text=message_text) # no selector — targets document.activeElement
|
||||
# Use browser_type_focused (not browser_type) here — browser_type
|
||||
# requires a selector, which cannot see past the #interop-outlet
|
||||
# shadow root. browser_type_focused targets document.activeElement
|
||||
# directly, sidestepping shadow boundaries entirely.
|
||||
browser_type_focused(text=message_text)
|
||||
sleep(1.0) # let Lexical commit state + enable Send button
|
||||
|
||||
# 7. Find the modal Send button (filter by in-viewport, reject pinned bar)
|
||||
@@ -143,7 +144,7 @@ send = browser_evaluate("""
|
||||
|
||||
# 8. ONLY click Send if it's enabled — if disabled, the insertText
|
||||
# didn't land. DO NOT retry with a different tool; the fix is
|
||||
# always: re-click the composer rect, re-run browser_type(text=...),
|
||||
# always: re-click the composer rect, re-run browser_type_focused(text=...),
|
||||
# re-check. The Send button's `disabled` state IS the ground truth —
|
||||
# if Lexical registered your text, it enables the button. If it's
|
||||
# still disabled, your text did not reach the editor, regardless
|
||||
@@ -153,7 +154,7 @@ if send['disabled']:
|
||||
# fall back to browser_type with a selector (see anti-pattern in
|
||||
# Common Pitfalls — selector-based type can't reach the shadow-DOM
|
||||
# composer). Instead: re-click the textarea rect from step 4, wait
|
||||
# a beat, re-run browser_type(text=message_text) (no selector) from
|
||||
# a beat, re-run browser_type_focused(text=message_text) from
|
||||
# step 6. If that still fails after 2 retries, bail and surface —
|
||||
# the modal may have been reclaimed by a stale state or auth wall.
|
||||
raise Exception("Send button disabled after insertText — editor did not receive input")
|
||||
@@ -170,7 +171,7 @@ Daily outbound pattern — accept pending connection requests and send a templat
|
||||
|
||||
```
|
||||
browser_navigate("https://www.linkedin.com/mynetwork/invitation-manager/received/",
|
||||
wait_until="load", timeout_ms=20000)
|
||||
wait_until="load")
|
||||
sleep(4)
|
||||
browser_evaluate("(function(){window.onbeforeunload=null;})()")
|
||||
|
||||
@@ -214,7 +215,7 @@ for card in cards[:25]:
|
||||
## Feed post composer flow
|
||||
|
||||
```
|
||||
browser_navigate("https://www.linkedin.com/feed/", wait_until="load", timeout_ms=20000)
|
||||
browser_navigate("https://www.linkedin.com/feed/", wait_until="load")
|
||||
sleep(4)
|
||||
browser_evaluate("(function(){window.onbeforeunload=null;})()")
|
||||
|
||||
@@ -301,7 +302,7 @@ If the image isn't already on disk, write it first with `write_file(absolute_pat
|
||||
|
||||
## Rate limits and safety
|
||||
|
||||
LinkedIn's abuse detection is aggressive. Respect these limits:
|
||||
LinkedIn's abuse detection is aggressive. Be aware of these limits and warn the user before exceeding them; only exceed a limit if the user explicitly confirms:
|
||||
|
||||
| Action | Limit |
|
||||
|---|---|
|
||||
@@ -309,8 +310,7 @@ LinkedIn's abuse detection is aggressive. Respect these limits:
|
||||
| Outbound messages to new 1st-degree connections | **25/day max**, 5–10s randomized delays |
|
||||
| Connection request sends | **100/week max**, spread across days, warm intros preferred |
|
||||
| Profile views | Several hundred/day is usually fine but varies by account age |
|
||||
| Post publications | 1–3/day, no URL-only posts |
|
||||
| Feed reactions | Dozens/day is fine; vary your activity mix |
|
||||
| Post publications | 1–5/day, no URL-only posts |
|
||||
|
||||
Signals you're being throttled:
|
||||
- "Message failed to send" with no error detail
|
||||
@@ -323,9 +323,8 @@ If any of those show up, **stop the run, screenshot the state, and surface the i
|
||||
## Common pitfalls
|
||||
|
||||
- **`innerHTML` injection is silently dropped** — LinkedIn's Trusted Types CSP discards any `innerHTML = "<...>"` from injected scripts, no console error. Always use `createElement` + `appendChild` + `setAttribute` for DOM injection. `textContent`, `style.cssText`, and `.value` assignments are fine.
|
||||
- **Do NOT pass a selector to `browser_type` on the message composer — call it with NO selector (`browser_type(text=...)`).** The Lexical contenteditable lives inside the `#interop-outlet` shadow root which `document.querySelector` (what the selector-based path uses under the hood) cannot see. Attempts to work around this with `browser_shadow_query` fail because selector-based `browser_type` doesn't support the `>>>` shadow-pierce syntax. The reliable insert path is: (1) `browser_click_coordinate` on the composer rect — the response's `focused_element` confirms Lexical received focus → (2) `browser_type(text=message_text)` with NO selector — CDP `Input.insertText` dispatches to `document.activeElement` regardless of shadow wrapping. The old `browser_evaluate` + `document.execCommand('insertText', ...)` pattern worked but had JSON-escaping pitfalls and cost ~200 chars of JS per send; `browser_type(text=...)` is the same mechanism with built-in retry.
|
||||
- **Per-char keyDown on the message composer produces empty text** — Lexical intercepts `beforeinput` and drops raw keys. Use `browser_type(text=..., use_insert_text=True)` with NO selector after click-coordinate focused the composer. The CDP `Input.insertText` method commits as if IME fired, which Lexical accepts cleanly. Do NOT pass a selector; selector-based `browser_type` can't see past `#interop-outlet`.
|
||||
- **ANTI-PATTERN: "inject a dummy `<div id='dummy-target'>` and pass it as the `selector` arg to `browser_type`".** This looks tempting but fails compoundingly: `browser_type` clicks the **dummy div's** rect (not the editor's), the click lands on the Lexical wrapper's non-editable chrome, the contenteditable never receives focus, and `Input.insertText` fires against nothing. The bridge will still return `{"ok": true, "action": "type", "length": N}` because it has no way to verify the text actually landed. Symptom: Send button stays `disabled: true` forever. Fix: `browser_click_coordinate` on the real composer rect, then `browser_type(text=message_text)` with NO selector — CDP `Input.insertText` dispatches to `document.activeElement`. (See `session_20260414_114820_08bd3c4d` for the failed dummy-div attempt.)
|
||||
- **Use `browser_type_focused` (not `browser_type`) on the message composer.** The Lexical contenteditable lives inside the `#interop-outlet` shadow root which `document.querySelector` (what `browser_type`'s selector path uses under the hood) cannot see. `browser_type` requires a selector and will fail with "Element not found". The reliable insert path is: (1) `browser_click_coordinate` on the composer rect — the response's `focused_element` confirms Lexical received focus → (2) `browser_type_focused(text=message_text)` — CDP `Input.insertText` dispatches to `document.activeElement` regardless of shadow wrapping.
|
||||
- **Per-char keyDown on the message composer produces empty text** — Lexical intercepts `beforeinput` and drops raw keys. Use `browser_type_focused(text=..., use_insert_text=True)` after `browser_click_coordinate` has focused the composer. The CDP `Input.insertText` method commits as if IME fired, which Lexical accepts cleanly.
|
||||
- **Multiple Send buttons on the page** — the pinned bottom-right messaging bar has its own `msg-form__send-button` that's usually below `innerHeight`. Filter by in-viewport before clicking.
|
||||
- **`window.onbeforeunload` hangs navigation/close** — after typing in a composer, any `browser_navigate` or `close_tab` can pop a native "unsent message, leave?" confirm dialog that deadlocks the bridge. Always strip `onbeforeunload` before any navigation, and wrap composer flows in a `try/finally` that runs the cleanup block:
|
||||
|
||||
@@ -346,7 +345,7 @@ browser_evaluate("""
|
||||
|
||||
## Auth wall detection
|
||||
|
||||
If you see a "Log in" / "Join LinkedIn" prompt instead of the logged-in feed, **stop immediately** and surface the issue. Do NOT attempt to log in via automation — LinkedIn's bot detection will flag the account.
|
||||
If you see a "Log in" / "Join LinkedIn" prompt instead of the logged-in feed, **stop immediately** and surface the issue to the user. Do NOT attempt to log in via automation — LinkedIn's bot detection will flag the account.
|
||||
|
||||
Check via:
|
||||
```
|
||||
|
||||
@@ -15,7 +15,10 @@ import { useColony } from "@/context/ColonyContext";
|
||||
|
||||
export default function Sidebar() {
|
||||
const navigate = useNavigate();
|
||||
const { colonies, queenProfiles, sidebarCollapsed, setSidebarCollapsed } = useColony();
|
||||
const { colonies, queens, queenProfiles, sidebarCollapsed, setSidebarCollapsed } = useColony();
|
||||
const activeQueenIds = new Set(
|
||||
queens.filter((q) => q.status === "online").map((q) => q.id),
|
||||
);
|
||||
const [coloniesExpanded, setColoniesExpanded] = useState(true);
|
||||
const [queensExpanded, setQueensExpanded] = useState(true);
|
||||
|
||||
@@ -148,7 +151,11 @@ export default function Sidebar() {
|
||||
{queensExpanded && (
|
||||
<div className="flex flex-col gap-0.5 mt-0.5">
|
||||
{queenProfiles.map((queen) => (
|
||||
<SidebarQueenItem key={queen.id} queen={queen} />
|
||||
<SidebarQueenItem
|
||||
key={queen.id}
|
||||
queen={queen}
|
||||
isActive={activeQueenIds.has(queen.id)}
|
||||
/>
|
||||
))}
|
||||
{queenProfiles.length === 0 && (
|
||||
<p className="px-5 py-2 text-xs text-sidebar-muted">
|
||||
|
||||
@@ -3,22 +3,29 @@ import type { QueenProfileSummary } from "@/types/colony";
|
||||
|
||||
interface SidebarQueenItemProps {
|
||||
queen: QueenProfileSummary;
|
||||
isActive?: boolean;
|
||||
}
|
||||
|
||||
export default function SidebarQueenItem({ queen }: SidebarQueenItemProps) {
|
||||
export default function SidebarQueenItem({ queen, isActive }: SidebarQueenItemProps) {
|
||||
return (
|
||||
<NavLink
|
||||
to={`/queen/${queen.id}`}
|
||||
className={({ isActive }) =>
|
||||
className={({ isActive: isRouteActive }) =>
|
||||
`group flex items-center gap-2.5 px-3 py-1.5 mx-2 rounded-md text-sm transition-colors ${
|
||||
isActive
|
||||
isRouteActive
|
||||
? "bg-sidebar-active-bg text-foreground font-medium"
|
||||
: "text-foreground/70 hover:bg-sidebar-item-hover hover:text-foreground"
|
||||
}`
|
||||
}
|
||||
>
|
||||
<span className="flex-shrink-0 w-6 h-6 rounded-full bg-primary/15 flex items-center justify-center text-[10px] font-bold text-primary">
|
||||
<span className="relative flex-shrink-0 w-6 h-6 rounded-full bg-primary/15 flex items-center justify-center text-[10px] font-bold text-primary">
|
||||
{queen.name.charAt(0)}
|
||||
{isActive && (
|
||||
<span
|
||||
className="absolute -bottom-0.5 -right-0.5 w-2 h-2 rounded-full bg-emerald-500 ring-2 ring-sidebar-bg"
|
||||
title="Session running"
|
||||
/>
|
||||
)}
|
||||
</span>
|
||||
<div className="min-w-0 flex-1 flex items-center gap-2">
|
||||
<span className="font-medium truncate">{queen.name}</span>
|
||||
|
||||
@@ -457,12 +457,12 @@ let currentView = 'grid';
|
||||
|
||||
// Tool categories for sidebar grouping
|
||||
const CATEGORIES = {
|
||||
'Lifecycle': ['browser_start', 'browser_stop', 'browser_status'],
|
||||
'Tabs': ['browser_tabs', 'browser_open', 'browser_close', 'browser_focus'],
|
||||
'Lifecycle': ['browser_setup', 'browser_start', 'browser_stop', 'browser_status'],
|
||||
'Tabs': ['browser_tabs', 'browser_open', 'browser_close', 'browser_close_all', 'browser_close_finished', 'browser_focus'],
|
||||
'Navigation': ['browser_navigate', 'browser_go_back', 'browser_go_forward', 'browser_reload'],
|
||||
'Interactions': ['browser_click', 'browser_click_coordinate', 'browser_type', 'browser_fill', 'browser_press', 'browser_press_at', 'browser_hover', 'browser_hover_coordinate', 'browser_select', 'browser_scroll'],
|
||||
'Inspection': ['browser_screenshot', 'browser_snapshot', 'browser_console', 'browser_get_text', 'browser_evaluate', 'browser_wait'],
|
||||
'Advanced': ['browser_resize', 'browser_upload', 'browser_dialog', 'browser_coords'],
|
||||
'Interactions': ['browser_click', 'browser_click_coordinate', 'browser_type', 'browser_type_focused', 'browser_fill', 'browser_press', 'browser_press_at', 'browser_hover', 'browser_hover_coordinate', 'browser_select', 'browser_scroll', 'browser_drag'],
|
||||
'Inspection': ['browser_screenshot', 'browser_snapshot', 'browser_console', 'browser_html', 'browser_get_text', 'browser_get_attribute', 'browser_get_rect', 'browser_shadow_query', 'browser_evaluate', 'browser_wait'],
|
||||
'Advanced': ['browser_resize', 'browser_upload', 'browser_dialog'],
|
||||
};
|
||||
|
||||
async function init() {
|
||||
|
||||
@@ -88,13 +88,13 @@ Find Textarea (it is hidden inside shadow DOM):
|
||||
```
|
||||
Click that coordinate, `sleep(1)`.
|
||||
|
||||
Inject text and Send:
|
||||
Type the message:
|
||||
Construct the message: `Hey {first_name}, thanks for the connection invite! I'm currently building a prediction market for jobs: https://honeycomb.open-hive.com/. If you could check it out and share some feedback, I'd really appreciate it.`
|
||||
|
||||
Escape the string properly for JS injection, then run:
|
||||
```javascript
|
||||
// Replace MSG_TEXT with your actual string
|
||||
browser_evaluate("(function(){ document.execCommand('insertText', false, `MSG_TEXT`); return true; })()")
|
||||
Use `browser_type_focused` — it dispatches CDP `Input.insertText` to the already-focused composer (document.activeElement), which works through shadow DOM without JSON-escaping issues:
|
||||
```
|
||||
browser_type_focused(text=message_text)
|
||||
sleep(1.0)
|
||||
```
|
||||
|
||||
Find Send button (also inside shadow DOM):
|
||||
|
||||
+195
-60
@@ -80,33 +80,70 @@ async def _adaptive_poll_sleep(elapsed_s: float) -> None:
|
||||
_interaction_highlights: dict[int, dict] = {}
|
||||
|
||||
|
||||
def clear_tab_highlights(tab_ids) -> None:
|
||||
"""Drop cached interaction highlights for the given tab_ids.
|
||||
|
||||
Called when a profile's context is destroyed so stale highlight
|
||||
rects can't reappear on a later tab that Chrome happens to assign
|
||||
the same id. Accepts a single id or any iterable.
|
||||
"""
|
||||
if isinstance(tab_ids, int):
|
||||
tab_ids = (tab_ids,)
|
||||
for tid in tab_ids:
|
||||
_interaction_highlights.pop(tid, None)
|
||||
|
||||
|
||||
# Compact descriptor of document.activeElement. Returned by both click()
|
||||
# and click_coordinate() so the agent can verify it focused what it
|
||||
# intended, then decide whether to follow up with browser_type(text=...,
|
||||
# no selector). Keeping this as a single shared string avoids drift
|
||||
# between the two click paths.
|
||||
# intended. When the outer document's activeElement is an <iframe>,
|
||||
# we recurse into the iframe's document (same-origin only) so the
|
||||
# response describes the real inner element — otherwise the agent
|
||||
# always sees {tag: "iframe"} and can't tell whether it hit the
|
||||
# composer or something else inside the frame (e.g. a sidebar item
|
||||
# in LinkedIn's #interop-outlet messaging overlay).
|
||||
_FOCUSED_ELEMENT_JS = """
|
||||
(function() {
|
||||
function describe(el) {
|
||||
var rect = el.getBoundingClientRect();
|
||||
var attrs = {};
|
||||
for (var i = 0; i < el.attributes.length && i < 10; i++) {
|
||||
attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
|
||||
}
|
||||
return {
|
||||
tag: el.tagName.toLowerCase(),
|
||||
id: el.id || null,
|
||||
className: el.className || null,
|
||||
name: el.getAttribute('name') || null,
|
||||
type: el.getAttribute('type') || null,
|
||||
role: el.getAttribute('role') || null,
|
||||
contenteditable: el.getAttribute('contenteditable') || null,
|
||||
text: (el.innerText || '').substring(0, 200),
|
||||
value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
|
||||
attributes: attrs,
|
||||
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
|
||||
};
|
||||
}
|
||||
var el = document.activeElement;
|
||||
if (!el || el === document.body) return null;
|
||||
var rect = el.getBoundingClientRect();
|
||||
var attrs = {};
|
||||
for (var i = 0; i < el.attributes.length && i < 10; i++) {
|
||||
attrs[el.attributes[i].name] = el.attributes[i].value.substring(0, 200);
|
||||
// Descend into same-origin iframes. Capped at 5 levels of
|
||||
// nesting to bound cost. Cross-origin frames throw on
|
||||
// contentDocument access → we catch and report the outermost
|
||||
// iframe instead.
|
||||
var framePath = [];
|
||||
var depth = 0;
|
||||
while (el && (el.tagName === 'IFRAME' || el.tagName === 'FRAME') && depth < 5) {
|
||||
framePath.push(el.id || el.getAttribute('data-testid') || el.tagName.toLowerCase());
|
||||
var innerDoc = null;
|
||||
try { innerDoc = el.contentDocument; } catch (e) { innerDoc = null; }
|
||||
if (!innerDoc) break;
|
||||
var innerActive = innerDoc.activeElement;
|
||||
if (!innerActive || innerActive === innerDoc.body) break;
|
||||
el = innerActive;
|
||||
depth++;
|
||||
}
|
||||
return {
|
||||
tag: el.tagName.toLowerCase(),
|
||||
id: el.id || null,
|
||||
className: el.className || null,
|
||||
name: el.getAttribute('name') || null,
|
||||
type: el.getAttribute('type') || null,
|
||||
role: el.getAttribute('role') || null,
|
||||
contenteditable: el.getAttribute('contenteditable') || null,
|
||||
text: (el.innerText || '').substring(0, 200),
|
||||
value: (el.value !== undefined ? String(el.value).substring(0, 200) : null),
|
||||
attributes: attrs,
|
||||
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }
|
||||
};
|
||||
var out = describe(el);
|
||||
if (framePath.length) out.inFrame = framePath;
|
||||
return out;
|
||||
})()
|
||||
"""
|
||||
|
||||
@@ -464,11 +501,14 @@ class BeelineBridge:
|
||||
"""Close a tab by ID."""
|
||||
result = await self._send("tab.close", tabId=tab_id)
|
||||
# Drop per-tab state — the id may be reused by Chrome much
|
||||
# later, and carrying a stale highlight or "attached" flag
|
||||
# forward would misannotate screenshots or skip a needed
|
||||
# reattach on the reused id.
|
||||
# later, and carrying a stale highlight, scale, or "attached"
|
||||
# flag forward would misannotate screenshots, misalign click
|
||||
# coordinates, or skip a needed reattach on the reused id.
|
||||
self._cdp_attached.discard(tab_id)
|
||||
_interaction_highlights.pop(tab_id, None)
|
||||
from .tools.inspection import clear_tab_state
|
||||
|
||||
clear_tab_state(tab_id)
|
||||
return result
|
||||
|
||||
async def list_tabs(self, group_id: int | None = None) -> dict:
|
||||
@@ -937,16 +977,36 @@ class BeelineBridge:
|
||||
async def _read_focused_element(self, tab_id: int) -> dict | None:
|
||||
"""Read document.activeElement and return a compact descriptor.
|
||||
|
||||
Returns None on any failure — never raises. Used by both click
|
||||
paths (selector-based click() and click_coordinate()) so the
|
||||
agent gets the same response shape regardless of which one was
|
||||
called. The descriptor lets the agent answer "did my click land
|
||||
on an editable?" without a second round-trip.
|
||||
The JS returns ``rect`` fields in CSS px (they come straight
|
||||
from ``getBoundingClientRect``). We convert them to fractions
|
||||
of the viewport here so the agent sees a rect in the same
|
||||
coord space it passed to click / hover / press_at.
|
||||
|
||||
Returns None on any failure — never raises.
|
||||
"""
|
||||
try:
|
||||
await self._try_enable_domain(tab_id, "Runtime")
|
||||
result = await self.evaluate(tab_id, _FOCUSED_ELEMENT_JS)
|
||||
return (result or {}).get("result")
|
||||
info = (result or {}).get("result")
|
||||
if info and isinstance(info, dict) and isinstance(info.get("rect"), dict):
|
||||
from .tools.inspection import _viewport_sizes
|
||||
|
||||
vp = _viewport_sizes.get(tab_id)
|
||||
if vp and vp[0] > 0 and vp[1] > 0:
|
||||
cw, ch = float(vp[0]), float(vp[1])
|
||||
r = info["rect"]
|
||||
info["rect"] = {
|
||||
"x": round(r.get("x", 0) / cw, 4),
|
||||
"y": round(r.get("y", 0) / ch, 4),
|
||||
"width": round(r.get("width", 0) / cw, 4),
|
||||
"height": round(r.get("height", 0) / ch, 4),
|
||||
}
|
||||
else:
|
||||
# Degraded: cache missing (no screenshot taken
|
||||
# yet). Leave rect in CSS px and flag it so the
|
||||
# agent can tell.
|
||||
info["rectSpace"] = "css"
|
||||
return info
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@@ -959,18 +1019,11 @@ class BeelineBridge:
|
||||
button_map = {"left": "left", "right": "right", "middle": "middle"}
|
||||
cdp_button = button_map.get(button, "left")
|
||||
|
||||
from .tools.inspection import _screenshot_css_scales, _screenshot_scales
|
||||
|
||||
phys_scale = _screenshot_scales.get(tab_id, "unset")
|
||||
css_scale = _screenshot_css_scales.get(tab_id, "unset")
|
||||
logger.info(
|
||||
"click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent. "
|
||||
"stored_scales: physicalScale=%s, cssScale=%s",
|
||||
"click_coordinate tab=%d: x=%.1f, y=%.1f → CDP Input.dispatchMouseEvent",
|
||||
tab_id,
|
||||
x,
|
||||
y,
|
||||
phys_scale,
|
||||
css_scale,
|
||||
)
|
||||
|
||||
await self._cdp(
|
||||
@@ -1113,7 +1166,9 @@ class BeelineBridge:
|
||||
# element (e.g. via browser_click_coordinate). Just clear the
|
||||
# active element if requested, then insert text directly.
|
||||
if clear_first:
|
||||
await self.evaluate(tab_id, """
|
||||
await self.evaluate(
|
||||
tab_id,
|
||||
"""
|
||||
(function() {
|
||||
const el = document.activeElement;
|
||||
if (!el) return;
|
||||
@@ -1125,7 +1180,8 @@ class BeelineBridge:
|
||||
el.dispatchEvent(new Event('input', {bubbles: true}));
|
||||
}
|
||||
})();
|
||||
""")
|
||||
""",
|
||||
)
|
||||
|
||||
if use_insert_text and delay_ms <= 0:
|
||||
# CDP Input.insertText is the most reliable way to insert
|
||||
@@ -1177,16 +1233,26 @@ class BeelineBridge:
|
||||
if rect:
|
||||
await self.highlight_rect(tab_id, rect["x"], rect["y"], rect["w"], rect["h"], label=selector)
|
||||
else:
|
||||
# Highlight the active element when no selector was provided
|
||||
# Highlight the active element when no selector was provided.
|
||||
# Drill into same-origin iframes to find the real focused
|
||||
# element — the top-level activeElement may be a full-screen
|
||||
# iframe whose rect covers the entire viewport.
|
||||
rect_result = await self.evaluate(
|
||||
tab_id,
|
||||
"(function(){const el=document.activeElement;if(!el)return null;"
|
||||
"(function(){"
|
||||
"var el=document.activeElement;"
|
||||
"try{while(el&&el.tagName==='IFRAME'&&el.contentDocument){"
|
||||
"el=el.contentDocument.activeElement;"
|
||||
"}}catch(e){}"
|
||||
"if(!el||el===document.body||el===document.documentElement)return null;"
|
||||
"const r=el.getBoundingClientRect();"
|
||||
"return{x:r.left,y:r.top,w:r.width,h:r.height};})()",
|
||||
)
|
||||
rect = (rect_result or {}).get("result")
|
||||
if rect:
|
||||
await self.highlight_rect(tab_id, rect["x"], rect["y"], rect["w"], rect["h"], label="active element")
|
||||
await self.highlight_rect(
|
||||
tab_id, rect["x"], rect["y"], rect["w"], rect["h"], label="active element", border_style="dashed"
|
||||
)
|
||||
return {"ok": True, "action": "type", "selector": selector, "length": len(text)}
|
||||
|
||||
# CDP Input.dispatchKeyEvent modifiers bitmask.
|
||||
@@ -1556,6 +1622,7 @@ class BeelineBridge:
|
||||
h: float,
|
||||
label: str = "",
|
||||
color: dict | None = None,
|
||||
border_style: str = "solid",
|
||||
) -> None:
|
||||
"""Inject a visible highlight overlay into the page DOM.
|
||||
|
||||
@@ -1584,7 +1651,7 @@ class BeelineBridge:
|
||||
box.id = '__hive_hl';
|
||||
box.style.cssText = 'position:fixed;z-index:2147483647;pointer-events:none;'
|
||||
+ 'left:{int(x)}px;top:{int(y)}px;width:{max(1, int(w))}px;height:{max(1, int(h))}px;'
|
||||
+ 'border:2px solid {border_rgb};background:{bg_rgba};'
|
||||
+ 'border:2px {border_style} {border_rgb};background:{bg_rgba};'
|
||||
+ 'border-radius:3px;transition:opacity 0.4s ease;opacity:1;'
|
||||
+ 'box-shadow:0 0 8px {bg_rgba};';
|
||||
|
||||
@@ -1927,7 +1994,7 @@ class BeelineBridge:
|
||||
"result": value,
|
||||
}
|
||||
|
||||
async def snapshot(self, tab_id: int, timeout_s: float = 30.0) -> dict:
|
||||
async def snapshot(self, tab_id: int, timeout_s: float = 30.0, mode: str = "default") -> dict:
|
||||
"""Get an accessibility snapshot of the page.
|
||||
|
||||
Uses a hybrid approach:
|
||||
@@ -1938,6 +2005,7 @@ class BeelineBridge:
|
||||
Args:
|
||||
tab_id: The tab ID to snapshot
|
||||
timeout_s: Maximum time to spend building snapshot (default 30s)
|
||||
mode: Filtering mode — "default", "simple", or "interactive"
|
||||
"""
|
||||
try:
|
||||
async with asyncio.timeout(timeout_s):
|
||||
@@ -1969,8 +2037,11 @@ class BeelineBridge:
|
||||
)
|
||||
return await self._dom_snapshot(tab_id)
|
||||
|
||||
# Clean redundant InlineTextBox children before formatting
|
||||
nodes = self._clean_inline_text_boxes(nodes)
|
||||
|
||||
# Format the accessibility tree (with node limit)
|
||||
snapshot = self._format_ax_tree(nodes, max_nodes=2000)
|
||||
snapshot = self._format_ax_tree(nodes, max_nodes=2000, mode=mode)
|
||||
|
||||
# Get URL
|
||||
url_result = await self._cdp(
|
||||
@@ -2104,13 +2175,78 @@ class BeelineBridge:
|
||||
"tree": "\n".join(lines),
|
||||
}
|
||||
|
||||
def _format_ax_tree(self, nodes: list[dict], max_nodes: int = 2000) -> str:
|
||||
@staticmethod
|
||||
def _clean_inline_text_boxes(nodes: list[dict]) -> list[dict]:
|
||||
"""Remove redundant InlineTextBox children from StaticText nodes.
|
||||
|
||||
If a StaticText node has 3+ InlineTextBox children and ALL their
|
||||
text is already contained in the StaticText's name, remove all
|
||||
the InlineTextBox children (they add no information).
|
||||
"""
|
||||
by_id = {n["nodeId"]: n for n in nodes}
|
||||
children_map: dict[str, list[str]] = {}
|
||||
for n in nodes:
|
||||
for child_id in n.get("childIds", []):
|
||||
children_map.setdefault(n["nodeId"], []).append(child_id)
|
||||
|
||||
ids_to_remove: set[str] = set()
|
||||
|
||||
for n in nodes:
|
||||
role_info = n.get("role", {})
|
||||
role = role_info.get("value", "") if isinstance(role_info, dict) else str(role_info)
|
||||
if role != "StaticText":
|
||||
continue
|
||||
|
||||
child_ids = children_map.get(n["nodeId"], [])
|
||||
if len(child_ids) < 3:
|
||||
continue
|
||||
|
||||
name_info = n.get("name", {})
|
||||
parent_name = name_info.get("value", "") if isinstance(name_info, dict) else str(name_info)
|
||||
if not parent_name:
|
||||
continue
|
||||
|
||||
all_inline = True
|
||||
for cid in child_ids:
|
||||
child = by_id.get(cid)
|
||||
if not child:
|
||||
all_inline = False
|
||||
break
|
||||
child_role_info = child.get("role", {})
|
||||
child_role = (
|
||||
child_role_info.get("value", "") if isinstance(child_role_info, dict) else str(child_role_info)
|
||||
)
|
||||
if child_role != "InlineTextBox":
|
||||
all_inline = False
|
||||
break
|
||||
child_name_info = child.get("name", {})
|
||||
child_name = (
|
||||
child_name_info.get("value", "") if isinstance(child_name_info, dict) else str(child_name_info)
|
||||
)
|
||||
if child_name and child_name not in parent_name:
|
||||
all_inline = False
|
||||
break
|
||||
|
||||
if all_inline:
|
||||
ids_to_remove.update(child_ids)
|
||||
n["childIds"] = []
|
||||
|
||||
if not ids_to_remove:
|
||||
return nodes
|
||||
|
||||
return [n for n in nodes if n["nodeId"] not in ids_to_remove]
|
||||
|
||||
def _format_ax_tree(self, nodes: list[dict], max_nodes: int = 2000, mode: str = "default") -> str:
|
||||
"""Format a CDP Accessibility.getFullAXTree result.
|
||||
|
||||
Args:
|
||||
nodes: List of accessibility tree nodes
|
||||
max_nodes: Maximum number of nodes to process (prevents hangs on huge trees)
|
||||
mode: Filtering mode — "default" (full tree), "simple" (interactive +
|
||||
content, skip unnamed structural), "interactive" (interactive only)
|
||||
"""
|
||||
from .refs import INTERACTIVE_ROLES, STRUCTURAL_ROLES
|
||||
|
||||
if not nodes:
|
||||
return "(empty tree)"
|
||||
|
||||
@@ -2150,11 +2286,21 @@ class BeelineBridge:
|
||||
_walk(cid, depth)
|
||||
return
|
||||
|
||||
node_counter[0] += 1
|
||||
|
||||
name_info = node.get("name", {})
|
||||
name = name_info.get("value", "") if isinstance(name_info, dict) else str(name_info)
|
||||
|
||||
# Mode-based filtering — skip node but walk children at same depth
|
||||
if mode == "interactive" and role not in INTERACTIVE_ROLES:
|
||||
for cid in children_map.get(node_id, []):
|
||||
_walk(cid, depth)
|
||||
return
|
||||
if mode == "simple" and role in STRUCTURAL_ROLES and not name:
|
||||
for cid in children_map.get(node_id, []):
|
||||
_walk(cid, depth)
|
||||
return
|
||||
|
||||
node_counter[0] += 1
|
||||
|
||||
# Build property annotations
|
||||
props: list[str] = []
|
||||
for prop in node.get("properties", []):
|
||||
@@ -2171,18 +2317,7 @@ class BeelineBridge:
|
||||
label = f"- {role}"
|
||||
|
||||
# Add ref for interactive elements
|
||||
interactive_roles = {
|
||||
"button",
|
||||
"link",
|
||||
"textbox",
|
||||
"checkbox",
|
||||
"radio",
|
||||
"combobox",
|
||||
"menuitem",
|
||||
"tab",
|
||||
"searchbox",
|
||||
}
|
||||
if role in interactive_roles or name:
|
||||
if role in INTERACTIVE_ROLES or name:
|
||||
ref_counter[0] += 1
|
||||
ref_id = f"e{ref_counter[0]}"
|
||||
ref_map[ref_id] = f"[{role}]{name}"
|
||||
|
||||
@@ -13,7 +13,13 @@ from typing import TYPE_CHECKING
|
||||
if TYPE_CHECKING:
|
||||
from .session import BrowserSession
|
||||
|
||||
# Role sets for interactive elements
|
||||
"""Shared ARIA role classification sets.
|
||||
|
||||
Keep these in sync across snapshot paths — divergence causes different
|
||||
drivers to produce different snapshot output for the same page.
|
||||
"""
|
||||
|
||||
# Roles that represent user-interactive elements and always get a ref.
|
||||
INTERACTIVE_ROLES: frozenset[str] = frozenset(
|
||||
{
|
||||
"button",
|
||||
@@ -26,7 +32,6 @@ INTERACTIVE_ROLES: frozenset[str] = frozenset(
|
||||
"menuitemradio",
|
||||
"option",
|
||||
"radio",
|
||||
"scrollbar",
|
||||
"searchbox",
|
||||
"slider",
|
||||
"spinbutton",
|
||||
@@ -37,11 +42,44 @@ INTERACTIVE_ROLES: frozenset[str] = frozenset(
|
||||
}
|
||||
)
|
||||
|
||||
NAMED_CONTENT_ROLES: frozenset[str] = frozenset(
|
||||
# Roles that carry meaningful content and get a ref when named.
|
||||
CONTENT_ROLES: frozenset[str] = frozenset(
|
||||
{
|
||||
"article",
|
||||
"cell",
|
||||
"columnheader",
|
||||
"gridcell",
|
||||
"heading",
|
||||
"img",
|
||||
"listitem",
|
||||
"main",
|
||||
"navigation",
|
||||
"region",
|
||||
"rowheader",
|
||||
}
|
||||
)
|
||||
|
||||
# Structural/container roles — typically skipped in compact mode.
|
||||
STRUCTURAL_ROLES: frozenset[str] = frozenset(
|
||||
{
|
||||
"application",
|
||||
"directory",
|
||||
"document",
|
||||
"generic",
|
||||
"grid",
|
||||
"group",
|
||||
"ignored",
|
||||
"list",
|
||||
"menu",
|
||||
"menubar",
|
||||
"none",
|
||||
"presentation",
|
||||
"row",
|
||||
"rowgroup",
|
||||
"table",
|
||||
"tablist",
|
||||
"toolbar",
|
||||
"tree",
|
||||
"treegrid",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -81,7 +119,7 @@ def annotate_snapshot(snapshot: str) -> tuple[str, RefMap]:
|
||||
role = m.group(2)
|
||||
name = m.group(3)
|
||||
|
||||
if role in INTERACTIVE_ROLES or (role in NAMED_CONTENT_ROLES and name):
|
||||
if role in INTERACTIVE_ROLES or (role in CONTENT_ROLES and name):
|
||||
candidates.append((i, role, name))
|
||||
|
||||
ref_map: RefMap = {}
|
||||
|
||||
@@ -255,6 +255,17 @@ def register_advanced_tools(mcp: FastMCP) -> None:
|
||||
|
||||
try:
|
||||
result = await bridge.resize(target_tab, width, height)
|
||||
# Invalidate per-tab scale caches — CSS width changed, so the
|
||||
# cached viewport dimensions are stale. Click / rect tools
|
||||
# will re-query innerWidth / innerHeight on next use via
|
||||
# _ensure_viewport_size.
|
||||
try:
|
||||
from .inspection import _screenshot_scales, _viewport_sizes
|
||||
|
||||
_viewport_sizes.pop(target_tab, None)
|
||||
_screenshot_scales.pop(target_tab, None)
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
except Exception as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
@@ -23,13 +23,40 @@ from .tabs import _get_context
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Target width for normalized screenshots (px in the delivered image)
|
||||
_SCREENSHOT_WIDTH = 600
|
||||
|
||||
# Maps tab_id -> physical scale: image_coord × scale = physical pixels (for CDP Input events)
|
||||
# Fixed output width for all screenshots (bandwidth default). This
|
||||
# number does NOT affect coordinate semantics — click / hover / press
|
||||
# and rect tools all work in fractions of the viewport (0..1), which
|
||||
# are invariant to whatever resize / tile the vision API applies. The
|
||||
# 800 px width is simply small enough to keep JPEG payloads under
|
||||
# ~150 KB on typical UI screenshots.
|
||||
_SCREENSHOT_WIDTH = 800
|
||||
|
||||
# Per-tab viewport-size cache populated on every browser_screenshot
|
||||
# and on lazy-init inside the click tools. Stores CSS-pixel viewport
|
||||
# dimensions (window.innerWidth / window.innerHeight). Click tools
|
||||
# multiply fractional inputs by these to get CSS coords before
|
||||
# dispatching CDP events; rect tools divide CSS-pixel DOM rects by
|
||||
# these to produce fractions for the agent.
|
||||
_viewport_sizes: dict[int, tuple[int, int]] = {}
|
||||
|
||||
# Optional debug cache — physical-px scale per tab (orig_png_w /
|
||||
# _SCREENSHOT_WIDTH). Logged only; no consumer.
|
||||
_screenshot_scales: dict[int, float] = {}
|
||||
# Maps tab_id -> CSS scale: image_coord × scale = CSS pixels (for DOM APIs / getBoundingClientRect)
|
||||
_screenshot_css_scales: dict[int, float] = {}
|
||||
|
||||
|
||||
def clear_tab_state(tab_ids) -> None:
    """Drop every per-tab cache entry held by this module for ``tab_ids``.

    Called when a tab closes or a profile's context is destroyed so stale
    values can't bleed into a later tab that Chrome happens to assign
    the same id.

    Args:
        tab_ids: A single integer tab id, or any iterable of tab ids.
    """
    if isinstance(tab_ids, int):
        tab_ids = (tab_ids,)
    for tid in tab_ids:
        _screenshot_scales.pop(tid, None)
        _screenshot_css_scales.pop(tid, None)
        # The viewport cache drives the fraction <-> CSS px conversion
        # for click / rect tools; a stale entry on a reused id would
        # misalign coordinates until the next screenshot refreshed it.
        _viewport_sizes.pop(tid, None)
|
||||
|
||||
|
||||
def _resize_and_annotate(
|
||||
@@ -37,18 +64,25 @@ def _resize_and_annotate(
|
||||
css_width: int,
|
||||
dpr: float = 1.0,
|
||||
highlights: list[dict] | None = None,
|
||||
width: int = _SCREENSHOT_WIDTH,
|
||||
) -> tuple[str, float, float]:
|
||||
"""Resize a base64 PNG to _SCREENSHOT_WIDTH wide, annotate highlights.
|
||||
) -> tuple[str, float]:
|
||||
"""Resize the captured PNG down to ``_SCREENSHOT_WIDTH`` (=800 px)
|
||||
and re-encode as JPEG quality 75.
|
||||
|
||||
Returns (new_b64, physical_scale, css_scale) where:
|
||||
physical_scale = physical_px_per_image_px (multiply image coords → physical px)
|
||||
css_scale = css_px_per_image_px (multiply image coords → CSS px for DOM APIs)
|
||||
The image dimensions do NOT determine click coordinates any more —
|
||||
the tools work in viewport fractions. This helper exists purely
|
||||
for bandwidth + annotation overlay. Returns ``(new_b64,
|
||||
physical_scale)`` where ``physical_scale = orig_png_w / output_w``
|
||||
is kept for debug logging.
|
||||
|
||||
Highlights have x,y,w,h in CSS pixels (what getBoundingClientRect returns,
|
||||
and what CDP Input.dispatchMouseEvent accepts).
|
||||
Falls back to original data if Pillow unavailable or resize fails.
|
||||
Highlight rects arrive in CSS px; they're converted to image-space
|
||||
for overlay drawing via the local ``css_to_image = css_width /
|
||||
output_w`` factor (computed inline — no external cache).
|
||||
"""
|
||||
if not css_width or css_width <= 0:
|
||||
# Bridge always supplies css_width from window.innerWidth; only
|
||||
# reach here on a degraded response. Return the raw PNG.
|
||||
return data, 1.0
|
||||
|
||||
try:
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
except ImportError:
|
||||
@@ -58,48 +92,44 @@ def _resize_and_annotate(
|
||||
import struct
|
||||
|
||||
orig_w = struct.unpack(">I", raw[16:20])[0]
|
||||
raw_size_bytes = len(raw)
|
||||
physical_scale = orig_w / width if orig_w and width else 1.0
|
||||
css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
|
||||
physical_scale = orig_w / _SCREENSHOT_WIDTH if orig_w else 1.0
|
||||
logger.warning(
|
||||
"PIL not available — screenshot resize SKIPPED (cannot downscale image). "
|
||||
"raw_size=%d bytes, png_width=%d, css_width=%s, dpr=%s, target_width=%d. "
|
||||
"Returning ORIGINAL image with computed scales: physicalScale=%.4f, cssScale=%.4f. "
|
||||
"Agent must use browser_coords() to convert image positions before clicking.",
|
||||
raw_size_bytes,
|
||||
orig_w,
|
||||
"PIL not available — screenshot resize SKIPPED. "
|
||||
"Returning raw physical-px PNG. physicalScale=%.4f, "
|
||||
"css_width=%d, dpr=%s. Install Pillow for annotation.",
|
||||
physical_scale,
|
||||
css_width,
|
||||
dpr,
|
||||
width,
|
||||
physical_scale,
|
||||
css_scale,
|
||||
)
|
||||
return data, round(physical_scale, 4), round(css_scale, 4)
|
||||
return data, round(physical_scale, 4)
|
||||
|
||||
try:
|
||||
raw = base64.b64decode(data)
|
||||
img = Image.open(io.BytesIO(raw)).convert("RGBA")
|
||||
orig_w, orig_h = img.size
|
||||
|
||||
physical_scale = orig_w / width
|
||||
css_scale = (css_width / width) if css_width else (physical_scale / max(dpr, 1.0))
|
||||
physical_scale = orig_w / _SCREENSHOT_WIDTH
|
||||
new_w = _SCREENSHOT_WIDTH
|
||||
new_h = round(orig_h * new_w / orig_w)
|
||||
if (new_w, new_h) != img.size:
|
||||
img = img.resize((new_w, new_h), Image.LANCZOS)
|
||||
|
||||
# Local CSS → image px factor for overlay draws. Kept local —
|
||||
# not exported, not stored, not leaked to the agent.
|
||||
css_to_image = css_width / _SCREENSHOT_WIDTH
|
||||
|
||||
logger.info(
|
||||
"Screenshot resize: orig=%dx%d → target=%dx%d, css_width=%s, dpr=%s, physicalScale=%.4f, cssScale=%.4f",
|
||||
"Screenshot: orig=%dx%d → out=%dx%d (css_width=%d, dpr=%s), physicalScale=%.4f, css_to_image=%.4f",
|
||||
orig_w,
|
||||
orig_h,
|
||||
width,
|
||||
round(orig_h * width / orig_w),
|
||||
new_w,
|
||||
new_h,
|
||||
css_width,
|
||||
dpr,
|
||||
physical_scale,
|
||||
css_scale,
|
||||
css_to_image,
|
||||
)
|
||||
|
||||
new_w = width
|
||||
new_h = round(orig_h * new_w / orig_w)
|
||||
img = img.resize((new_w, new_h), Image.LANCZOS)
|
||||
|
||||
if highlights:
|
||||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
||||
draw = ImageDraw.Draw(overlay)
|
||||
@@ -111,11 +141,11 @@ def _resize_and_annotate(
|
||||
for h in highlights:
|
||||
kind = h.get("kind", "rect")
|
||||
label = h.get("label", "")
|
||||
# Highlights are in CSS px → convert to image px
|
||||
ix = h["x"] / css_scale
|
||||
iy = h["y"] / css_scale
|
||||
iw = h.get("w", 0) / css_scale
|
||||
ih = h.get("h", 0) / css_scale
|
||||
# Highlights arrive in CSS px → convert to image px.
|
||||
ix = h["x"] / css_to_image
|
||||
iy = h["y"] / css_to_image
|
||||
iw = h.get("w", 0) / css_to_image
|
||||
ih = h.get("h", 0) / css_to_image
|
||||
|
||||
if kind == "point":
|
||||
cx, cy, r = ix, iy, 10
|
||||
@@ -135,11 +165,9 @@ def _resize_and_annotate(
|
||||
width=2,
|
||||
)
|
||||
|
||||
# Label: show image pixel position so user knows where to look
|
||||
img_coords = f"img:({round(ix)},{round(iy)})"
|
||||
display_label = f"{img_coords} {label}" if label else img_coords
|
||||
display_label = f"({round(ix)},{round(iy)}) {label}".strip()
|
||||
lx, ly = ix, max(2, iy - 16)
|
||||
lx = max(2, min(lx, width - 120))
|
||||
lx = max(2, min(lx, new_w - 120))
|
||||
bbox = draw.textbbox((lx, ly), display_label, font=font)
|
||||
pad = 3
|
||||
draw.rectangle(
|
||||
@@ -153,22 +181,50 @@ def _resize_and_annotate(
|
||||
img = img.convert("RGB")
|
||||
|
||||
buf = io.BytesIO()
|
||||
img.save(buf, format="PNG", optimize=True)
|
||||
img.save(buf, format="JPEG", quality=75, optimize=True)
|
||||
return (
|
||||
base64.b64encode(buf.getvalue()).decode(),
|
||||
round(physical_scale, 4),
|
||||
round(css_scale, 4),
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Screenshot resize/annotate FAILED — returning original image with scale=1.0. "
|
||||
"css_width=%s, dpr=%s, target_width=%d. Clicks will be misaligned.",
|
||||
"Screenshot resize/annotate FAILED — returning original image. "
|
||||
"css_width=%s, dpr=%s.",
|
||||
css_width,
|
||||
dpr,
|
||||
width,
|
||||
exc_info=True,
|
||||
)
|
||||
return data, 1.0, 1.0
|
||||
return data, 1.0
|
||||
|
||||
|
||||
async def _ensure_viewport_size(tab_id: int) -> tuple[int, int]:
    """Return ``(cssWidth, cssHeight)`` for ``tab_id``.

    Serves the value from ``_viewport_sizes`` when a positive entry is
    cached; otherwise asks the page for ``window.innerWidth`` /
    ``window.innerHeight`` through the bridge and caches the answer.

    If the query fails or reports a non-positive size, the identity
    ``(1, 1)`` is cached and returned instead of raising, so callers'
    fraction conversions become a no-op rather than a crash.

    Args:
        tab_id: Chrome tab id whose viewport size is needed.

    Returns:
        The viewport size in CSS pixels, or ``(1, 1)`` when degraded.
    """
    cached = _viewport_sizes.get(tab_id)
    if cached is not None and cached[0] > 0 and cached[1] > 0:
        return cached
    bridge = get_bridge()
    try:
        result = await bridge.evaluate(tab_id, "({w: window.innerWidth, h: window.innerHeight})")
        inner = (result or {}).get("result") or {}
        cw = int(float(inner.get("w") or 0))
        ch = int(float(inner.get("h") or 0))
    except Exception:
        # Best-effort by design: never raise, but leave a trace so a
        # misaligned click can be tied back to this failure.
        logger.debug("Viewport size query failed for tab %s", tab_id, exc_info=True)
        cw, ch = 0, 0
    if cw <= 0 or ch <= 0:
        # Degraded: bridge didn't return viewport. Cache an identity
        # so we don't retry on every call; the entry is overwritten
        # when a later screenshot stores real dimensions.
        logger.warning(
            "No usable viewport size for tab %s; caching identity (1, 1)",
            tab_id,
        )
        cw, ch = 1, 1
    _viewport_sizes[tab_id] = (cw, ch)
    return cw, ch
|
||||
|
||||
|
||||
def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
@@ -180,29 +236,33 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
profile: str | None = None,
|
||||
full_page: bool = False,
|
||||
selector: str | None = None,
|
||||
image_type: Literal["png", "jpeg"] = "png",
|
||||
annotate: bool = True,
|
||||
width: int = _SCREENSHOT_WIDTH,
|
||||
) -> list:
|
||||
"""
|
||||
Take a screenshot of the current page.
|
||||
|
||||
Returns a normalized image alongside text metadata (URL, size, scale
|
||||
factors, etc.). Automatically annotates the last interaction (click,
|
||||
hover, type) with a bounding box overlay.
|
||||
Image is 800 px wide (JPEG quality 75, ~50–120 KB). All
|
||||
coordinate tools work in **fractions of the viewport (0..1)**,
|
||||
not pixels — so read a target's proportional position off this
|
||||
image ("~35 % from the left, ~20 % from the top") and pass
|
||||
``(0.35, 0.20)`` to ``browser_click_coordinate`` /
|
||||
``browser_hover_coordinate`` / ``browser_press_at``.
|
||||
``browser_get_rect`` and ``browser_shadow_query`` likewise
|
||||
return coordinates as fractions.
|
||||
|
||||
Args:
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
full_page: Capture full scrollable page (default: False)
|
||||
full_page: Capture full scrollable page (default: False).
|
||||
Note: full_page images extend beyond the viewport, so
|
||||
fractions read off them do NOT map cleanly to
|
||||
viewport-space clicks. Use for reading / overview only,
|
||||
not for pointing.
|
||||
selector: CSS selector to screenshot a specific element (optional)
|
||||
image_type: Image format - png or jpeg (default: png)
|
||||
annotate: Draw bounding box of last interaction on image (default: True)
|
||||
width: Output image width in pixels (default: 600). Use 800+ for fine
|
||||
text, 400 for quick layout checks.
|
||||
|
||||
Returns:
|
||||
List of content blocks: text metadata + image
|
||||
List of content blocks: text metadata + image.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {
|
||||
@@ -252,7 +312,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return [TextContent(type="text", text=json.dumps(screenshot_result))]
|
||||
|
||||
data = screenshot_result.get("data")
|
||||
mime_type = screenshot_result.get("mimeType", "image/png")
|
||||
css_width = screenshot_result.get("cssWidth", 0)
|
||||
dpr = screenshot_result.get("devicePixelRatio", 1.0)
|
||||
|
||||
@@ -263,45 +322,50 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
if annotate and target_tab in _interaction_highlights:
|
||||
highlights = [_interaction_highlights[target_tab]]
|
||||
|
||||
# Normalize to 800px wide and annotate. Offloaded to a
|
||||
# thread because PIL Image.open/resize/ImageDraw/composite on
|
||||
# a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
|
||||
# freeze the asyncio event loop and delay every concurrent
|
||||
# tool call during a screenshot. The function is reentrant
|
||||
# (fresh PIL Image per call, no shared state), so to_thread
|
||||
# is safe.
|
||||
data, physical_scale, css_scale = await asyncio.to_thread(
|
||||
# Resize to the fixed 800 px output width (_SCREENSHOT_WIDTH)
|
||||
# and re-encode as JPEG. Offloaded to a thread because PIL
|
||||
# Image.open/resize/ImageDraw/composite on a 2-megapixel
|
||||
# PNG blocks for ~150–300 ms of CPU — plenty to freeze the
|
||||
# asyncio event loop. Reentrant: no shared state.
|
||||
data, physical_scale = await asyncio.to_thread(
|
||||
_resize_and_annotate,
|
||||
data,
|
||||
css_width,
|
||||
dpr,
|
||||
highlights,
|
||||
width,
|
||||
)
|
||||
_screenshot_scales[target_tab] = physical_scale
|
||||
_screenshot_css_scales[target_tab] = css_scale
|
||||
# Cache live viewport dimensions so click / hover / press /
|
||||
# rect tools can translate fractions ↔ CSS px without
|
||||
# asking the page again.
|
||||
css_height = int(screenshot_result.get("cssHeight", 0)) or 0
|
||||
if target_tab is not None and css_width > 0 and css_height > 0:
|
||||
_viewport_sizes[target_tab] = (int(css_width), css_height)
|
||||
_screenshot_scales[target_tab] = physical_scale
|
||||
|
||||
meta = json.dumps(
|
||||
{
|
||||
"ok": True,
|
||||
"tabId": target_tab,
|
||||
"url": screenshot_result.get("url", ""),
|
||||
"imageType": mime_type.split("/")[-1],
|
||||
"imageType": "jpeg",
|
||||
"size": len(base64.b64decode(data)) if data else 0,
|
||||
"imageWidth": width,
|
||||
"imageWidth": _SCREENSHOT_WIDTH,
|
||||
"cssWidth": css_width,
|
||||
"cssHeight": css_height,
|
||||
"fullPage": full_page,
|
||||
"devicePixelRatio": dpr,
|
||||
"physicalScale": physical_scale,
|
||||
"cssScale": css_scale,
|
||||
"annotated": bool(highlights),
|
||||
"scaleHint": (
|
||||
f"image_coord × {css_scale} = CSS px "
|
||||
f"→ feed to browser_click_coordinate, "
|
||||
f"browser_hover_coordinate, browser_press_at "
|
||||
f"(CDP Input events use CSS pixels). "
|
||||
f"image_coord × {physical_scale} = physical px "
|
||||
f"is debug-only on HiDPI displays and must NOT "
|
||||
f"be used for clicks — it overshoots by DPR×."
|
||||
"Coordinates for click / hover / press are "
|
||||
"fractions 0..1 of the viewport. Read a "
|
||||
"target's proportional position off this image "
|
||||
"(e.g. '~35 % from the left, ~20 % from the top' "
|
||||
"→ (0.35, 0.20)) and pass that to "
|
||||
"browser_click_coordinate / "
|
||||
"browser_hover_coordinate / browser_press_at. "
|
||||
"browser_get_rect / browser_shadow_query / "
|
||||
"focused_element.rect return fractions too."
|
||||
),
|
||||
}
|
||||
)
|
||||
@@ -313,17 +377,17 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
"ok": True,
|
||||
"size": len(base64.b64decode(data)) if data else 0,
|
||||
"url": screenshot_result.get("url", ""),
|
||||
"cssWidth": css_width,
|
||||
"cssHeight": css_height,
|
||||
"physicalScale": physical_scale,
|
||||
"cssScale": css_scale,
|
||||
"debug_cssWidth": css_width,
|
||||
"debug_dpr": dpr,
|
||||
"dpr": dpr,
|
||||
},
|
||||
duration_ms=(time.perf_counter() - start) * 1000,
|
||||
)
|
||||
|
||||
return [
|
||||
TextContent(type="text", text=meta),
|
||||
ImageContent(type="image", data=data, mimeType=mime_type),
|
||||
ImageContent(type="image", data=data, mimeType="image/jpeg"),
|
||||
]
|
||||
except Exception as e:
|
||||
log_tool_call(
|
||||
@@ -334,73 +398,6 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
)
|
||||
return [TextContent(type="text", text=json.dumps({"ok": False, "error": str(e)}))]
|
||||
|
||||
@mcp.tool()
def browser_coords(
    x: float,
    y: float,
    tab_id: int | None = None,
    profile: str | None = None,
) -> dict:
    """
    Convert screenshot image coordinates to browser click coordinates.

    After browser_screenshot returns a downscaled image, use this to
    translate pixel positions you see in the image into the CSS pixel
    coordinates that Chrome DevTools Protocol expects.

    **CDP Input.dispatchMouseEvent uses CSS pixels**, so you want
    ``css_x`` / ``css_y`` for every click/hover tool. ``physical_x/y``
    is kept in the return for debugging on HiDPI displays — do NOT
    feed it to clicks; on a DPR=2 screen it lands 2× too far.

    Edge case: pages using ``zoom`` or ``transform: scale()`` (e.g.
    LinkedIn's ``#interop-outlet`` shadow DOM) render in a scaled
    local coordinate space. For those, ``getBoundingClientRect()``
    reports pre-zoom coordinates and you may still need to multiply
    by the element's effective zoom. Use browser_shadow_query to
    get the zoomed rect directly.

    Args:
        x: X pixel position in the screenshot image
        y: Y pixel position in the screenshot image
        tab_id: Chrome tab ID (default: active tab for profile)
        profile: Browser profile name (default: "default")

    Returns:
        Dict with css_x, css_y (primary — use these), physical_x,
        physical_y (debug only), and scale factors.
    """
    # NOTE(review): the docstring is kept word-for-word because it is
    # presumably surfaced as the tool description by @mcp.tool() — confirm
    # before rewording it.
    ctx = _get_context(profile)

    # Resolve the tab: an explicit (truthy) tab_id wins, otherwise fall back
    # to the profile context's active tab when a context exists.
    if tab_id:
        target_tab = tab_id
    elif ctx:
        target_tab = ctx.get("activeTabId")
    else:
        target_tab = None

    # Scale factors are cached per tab by the screenshot tool. With no tab,
    # or no cached entry, the physical scale is 1.0 and the CSS scale falls
    # back to whatever the physical scale resolved to.
    if target_tab:
        physical_scale = _screenshot_scales.get(target_tab, 1.0)
        css_scale = _screenshot_css_scales.get(target_tab, physical_scale)
    else:
        physical_scale = 1.0
        css_scale = physical_scale

    css_x = round(x * css_scale, 1)
    css_y = round(y * css_scale, 1)
    physical_x = round(x * physical_scale, 1)
    physical_y = round(y * physical_scale, 1)

    usage_note = (
        "Use css_x/css_y with browser_click_coordinate, "
        "browser_hover_coordinate, browser_press_at — "
        "Chrome DevTools Protocol Input.dispatchMouseEvent "
        "operates in CSS pixels. physical_x/y is for debugging "
        "on HiDPI displays only; feeding it to clicks lands "
        "them at DPR× the intended coordinate."
    )

    return {
        "ok": True,
        # CSS pixels are the primary output: these are what the
        # click / hover / press tools should receive.
        "css_x": css_x,
        "css_y": css_y,
        # Physical pixels are debug-only; sending them to clicks on a
        # HiDPI display lands roughly DPR× away from the intended point.
        "physical_x": physical_x,
        "physical_y": physical_y,
        "physicalScale": physical_scale,
        "cssScale": css_scale,
        "tabId": target_tab,
        "note": usage_note,
    }
|
||||
|
||||
@mcp.tool()
|
||||
async def browser_shadow_query(
|
||||
selector: str,
|
||||
@@ -412,7 +409,9 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
|
||||
Traverses shadow roots to find elements inside closed/open shadow DOM,
|
||||
overlays, and virtual-rendered components (e.g. LinkedIn's #interop-outlet).
|
||||
Returns getBoundingClientRect in both CSS and physical pixels.
|
||||
Returns the element's bounding rect as **fractions of the
|
||||
viewport (0..1)** — feed ``rect.cx`` / ``rect.cy`` straight
|
||||
into browser_click_coordinate / hover_coordinate / press_at.
|
||||
|
||||
Args:
|
||||
selector: CSS selectors joined by ' >>> ' to pierce shadow roots.
|
||||
@@ -421,7 +420,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
Returns:
|
||||
Dict with rect (CSS px) and physical rect (CSS px × DPR) of the element
|
||||
Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
|
||||
plus ``cssWidth`` / ``cssHeight`` for reference.
|
||||
"""
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
@@ -438,36 +438,27 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
rect = result["rect"]
|
||||
physical_scale = _screenshot_scales.get(target_tab, 1.0)
|
||||
css_scale = _screenshot_css_scales.get(target_tab, 1.0)
|
||||
dpr = physical_scale / css_scale if css_scale else 1.0
|
||||
|
||||
cw, ch = await _ensure_viewport_size(target_tab)
|
||||
cw_f = float(cw) if cw > 0 else 1.0
|
||||
ch_f = float(ch) if ch > 0 else 1.0
|
||||
return {
|
||||
"ok": True,
|
||||
"selector": selector,
|
||||
"tag": rect.get("tag"),
|
||||
"css": {
|
||||
"x": rect["x"],
|
||||
"y": rect["y"],
|
||||
"w": rect["w"],
|
||||
"h": rect["h"],
|
||||
"cx": rect["cx"],
|
||||
"cy": rect["cy"],
|
||||
},
|
||||
"physical": {
|
||||
"x": round(rect["x"] * dpr, 1),
|
||||
"y": round(rect["y"] * dpr, 1),
|
||||
"w": round(rect["w"] * dpr, 1),
|
||||
"h": round(rect["h"] * dpr, 1),
|
||||
"cx": round(rect["cx"] * dpr, 1),
|
||||
"cy": round(rect["cy"] * dpr, 1),
|
||||
"rect": {
|
||||
"x": round(rect["x"] / cw_f, 4),
|
||||
"y": round(rect["y"] / ch_f, 4),
|
||||
"w": round(rect["w"] / cw_f, 4),
|
||||
"h": round(rect["h"] / ch_f, 4),
|
||||
"cx": round(rect["cx"] / cw_f, 4),
|
||||
"cy": round(rect["cy"] / ch_f, 4),
|
||||
},
|
||||
"cssWidth": cw,
|
||||
"cssHeight": ch,
|
||||
"note": (
|
||||
"Use css.cx/cy with browser_click_coordinate, "
|
||||
"browser_hover_coordinate, browser_press_at — "
|
||||
"CDP Input events operate in CSS pixels. "
|
||||
"physical.* is debug-only; feeding it to clicks "
|
||||
"lands them DPR× too far on HiDPI displays."
|
||||
"rect fields are fractions of the viewport (0..1). "
|
||||
"Pass rect.cx / rect.cy to browser_click_coordinate / "
|
||||
"hover_coordinate / press_at."
|
||||
),
|
||||
}
|
||||
|
||||
@@ -480,11 +471,10 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
"""
|
||||
Get the bounding rect of an element by CSS selector.
|
||||
|
||||
Supports '>>>' shadow-piercing selectors for overlay/shadow DOM content.
|
||||
Returns coordinates in CSS pixels (for clicks and DOM APIs); the
|
||||
physical-pixel variant is returned for debugging on HiDPI displays
|
||||
only — it must not be fed to click/hover/press tools, which use
|
||||
CSS pixels.
|
||||
Supports '>>>' shadow-piercing selectors for overlay/shadow DOM
|
||||
content. Returns the rect as **fractions of the viewport
|
||||
(0..1)** — the same coordinate space browser_click_coordinate
|
||||
/ hover_coordinate / press_at expect.
|
||||
|
||||
Args:
|
||||
selector: CSS selector, optionally with ' >>> ' to pierce shadow roots.
|
||||
@@ -493,7 +483,8 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
Returns:
|
||||
Dict with css and physical bounding rects
|
||||
Dict with ``rect`` block (x, y, w, h, cx, cy) as fractions,
|
||||
plus ``cssWidth`` / ``cssHeight`` for reference.
|
||||
"""
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
@@ -510,36 +501,27 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
rect = result["rect"]
|
||||
physical_scale = _screenshot_scales.get(target_tab, 1.0)
|
||||
css_scale = _screenshot_css_scales.get(target_tab, 1.0)
|
||||
dpr = physical_scale / css_scale if css_scale else 1.0
|
||||
|
||||
cw, ch = await _ensure_viewport_size(target_tab)
|
||||
cw_f = float(cw) if cw > 0 else 1.0
|
||||
ch_f = float(ch) if ch > 0 else 1.0
|
||||
return {
|
||||
"ok": True,
|
||||
"selector": selector,
|
||||
"tag": rect.get("tag"),
|
||||
"css": {
|
||||
"x": rect["x"],
|
||||
"y": rect["y"],
|
||||
"w": rect["w"],
|
||||
"h": rect["h"],
|
||||
"cx": rect["cx"],
|
||||
"cy": rect["cy"],
|
||||
},
|
||||
"physical": {
|
||||
"x": round(rect["x"] * dpr, 1),
|
||||
"y": round(rect["y"] * dpr, 1),
|
||||
"w": round(rect["w"] * dpr, 1),
|
||||
"h": round(rect["h"] * dpr, 1),
|
||||
"cx": round(rect["cx"] * dpr, 1),
|
||||
"cy": round(rect["cy"] * dpr, 1),
|
||||
"rect": {
|
||||
"x": round(rect["x"] / cw_f, 4),
|
||||
"y": round(rect["y"] / ch_f, 4),
|
||||
"w": round(rect["w"] / cw_f, 4),
|
||||
"h": round(rect["h"] / ch_f, 4),
|
||||
"cx": round(rect["cx"] / cw_f, 4),
|
||||
"cy": round(rect["cy"] / ch_f, 4),
|
||||
},
|
||||
"cssWidth": cw,
|
||||
"cssHeight": ch,
|
||||
"note": (
|
||||
"Use css.cx/cy with browser_click_coordinate, "
|
||||
"browser_hover_coordinate, browser_press_at — "
|
||||
"CDP Input events operate in CSS pixels. "
|
||||
"physical.* is debug-only; feeding it to clicks "
|
||||
"lands them DPR× too far on HiDPI displays."
|
||||
"rect fields are fractions of the viewport (0..1). "
|
||||
"Pass rect.cx / rect.cy to browser_click_coordinate / "
|
||||
"hover_coordinate / press_at."
|
||||
),
|
||||
}
|
||||
|
||||
@@ -547,6 +529,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
async def browser_snapshot(
|
||||
tab_id: int | None = None,
|
||||
profile: str | None = None,
|
||||
mode: Literal["default", "simple", "interactive"] = "default",
|
||||
) -> dict:
|
||||
"""
|
||||
Get an accessibility snapshot of the page.
|
||||
@@ -565,12 +548,16 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
Args:
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
mode: Snapshot filtering mode (default: "default")
|
||||
- "default": full accessibility tree
|
||||
- "simple": interactive + content nodes, skip unnamed structural nodes
|
||||
- "interactive": only interactive nodes (buttons, links, inputs, etc.)
|
||||
|
||||
Returns:
|
||||
Dict with the snapshot text tree, URL, and tab ID
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"tab_id": tab_id, "profile": profile}
|
||||
params = {"tab_id": tab_id, "profile": profile, "mode": mode}
|
||||
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
@@ -591,7 +578,7 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
try:
|
||||
snapshot_result = await bridge.snapshot(target_tab)
|
||||
snapshot_result = await bridge.snapshot(target_tab, mode=mode)
|
||||
log_tool_call(
|
||||
"browser_snapshot",
|
||||
params,
|
||||
|
||||
@@ -108,24 +108,31 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
button: Literal["left", "right", "middle"] = "left",
|
||||
) -> dict:
|
||||
"""
|
||||
Click at specific viewport coordinates (CSS pixels).
|
||||
Click at a FRACTION of the viewport (0..1, 0..1).
|
||||
|
||||
Chrome DevTools Protocol's Input.dispatchMouseEvent operates in
|
||||
**CSS pixels**, not physical pixels. If you have a screenshot
|
||||
image coordinate, convert it with ``browser_coords(x, y)`` and
|
||||
use the returned ``css_x`` / ``css_y`` — not ``physical_x/y``.
|
||||
On a DPR=2 display, feeding physical coordinates lands the click
|
||||
at 2× the intended position.
|
||||
Coordinates are **fractions of the viewport**, not pixels:
|
||||
``(0.5, 0.5)`` is the center, ``(0.1, 0.2)`` is 10 % from the
|
||||
left and 20 % from the top. Read a target's proportional
|
||||
position off ``browser_screenshot`` (or pass
|
||||
``rect.cx`` / ``rect.cy`` from ``browser_get_rect`` /
|
||||
``browser_shadow_query`` directly — they return fractions too).
|
||||
|
||||
Fractions are used because every vision model resizes or tiles
|
||||
images differently (Claude ~1.15 MP target, GPT-4o 512-px
|
||||
tiles, etc.). Proportional positions survive every such
|
||||
transform; pixel coords do not.
|
||||
|
||||
Args:
|
||||
x: X coordinate in CSS pixels (viewport space)
|
||||
y: Y coordinate in CSS pixels (viewport space)
|
||||
x: X fraction of the viewport (0..1).
|
||||
y: Y fraction of the viewport (0..1).
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
button: Mouse button to click (left, right, middle)
|
||||
|
||||
Returns:
|
||||
Dict with click result
|
||||
Dict with click result, including ``focused_element``
|
||||
describing what the click focused. ``focused_element.rect``
|
||||
is also in fractions.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"x": x, "y": y, "tab_id": tab_id, "profile": profile, "button": button}
|
||||
@@ -148,18 +155,33 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
log_tool_call("browser_click_coordinate", params, result=result)
|
||||
return result
|
||||
|
||||
try:
|
||||
from .inspection import _screenshot_css_scales, _screenshot_scales
|
||||
# Pixel-input guard: legitimate fractions live in [0, 1]. Allow a
|
||||
# small overshoot tolerance for edge targets.
|
||||
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
|
||||
result = {
|
||||
"ok": False,
|
||||
"error": (
|
||||
f"Coords ({x}, {y}) look like pixels. This tool expects "
|
||||
"fractions 0..1 of the viewport. Read the target's "
|
||||
"proportional position off browser_screenshot, or pass "
|
||||
"rect.cx / rect.cy from browser_get_rect / "
|
||||
"browser_shadow_query (they return fractions)."
|
||||
),
|
||||
}
|
||||
log_tool_call("browser_click_coordinate", params, result=result)
|
||||
return result
|
||||
|
||||
click_result = await bridge.click_coordinate(target_tab, x, y, button=button)
|
||||
try:
|
||||
from .inspection import _ensure_viewport_size
|
||||
|
||||
cw, ch = await _ensure_viewport_size(target_tab)
|
||||
css_x = x * cw
|
||||
css_y = y * ch
|
||||
click_result = await bridge.click_coordinate(target_tab, css_x, css_y, button=button)
|
||||
log_tool_call(
|
||||
"browser_click_coordinate",
|
||||
params,
|
||||
result={
|
||||
**click_result,
|
||||
"debug_stored_physicalScale": _screenshot_scales.get(target_tab, "unset"),
|
||||
"debug_stored_cssScale": _screenshot_css_scales.get(target_tab, "unset"),
|
||||
},
|
||||
result={**click_result, "cssWidth": cw, "cssHeight": ch},
|
||||
duration_ms=(time.perf_counter() - start) * 1000,
|
||||
)
|
||||
return click_result
|
||||
@@ -484,15 +506,16 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
profile: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Hover at CSS pixel coordinates without needing a CSS selector.
|
||||
Hover at a FRACTION of the viewport (0..1, 0..1).
|
||||
|
||||
Use this instead of browser_hover when the element is in an overlay,
|
||||
shadow DOM, or virtual-rendered component that isn't in the regular DOM.
|
||||
Pair with browser_coords to convert screenshot image positions to CSS pixels.
|
||||
``x`` / ``y`` are fractions of the viewport (``0.5`` = center);
|
||||
the tool converts to CSS px internally.
|
||||
|
||||
Args:
|
||||
x: CSS pixel X coordinate
|
||||
y: CSS pixel Y coordinate
|
||||
x: X fraction of the viewport (0..1).
|
||||
y: Y fraction of the viewport (0..1).
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
|
||||
@@ -520,8 +543,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
log_tool_call("browser_hover_coordinate", params, result=result)
|
||||
return result
|
||||
|
||||
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
|
||||
result = {
|
||||
"ok": False,
|
||||
"error": (
|
||||
f"Coords ({x}, {y}) look like pixels. This tool expects "
|
||||
"fractions 0..1 of the viewport."
|
||||
),
|
||||
}
|
||||
log_tool_call("browser_hover_coordinate", params, result=result)
|
||||
return result
|
||||
|
||||
try:
|
||||
hover_result = await bridge.hover_coordinate(target_tab, x, y)
|
||||
from .inspection import _ensure_viewport_size
|
||||
|
||||
cw, ch = await _ensure_viewport_size(target_tab)
|
||||
hover_result = await bridge.hover_coordinate(target_tab, x * cw, y * ch)
|
||||
log_tool_call(
|
||||
"browser_hover_coordinate",
|
||||
params,
|
||||
@@ -548,16 +585,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
profile: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Move mouse to CSS pixel coordinates then press a key.
|
||||
Move mouse to a FRACTION of the viewport (0..1, 0..1), then press a key.
|
||||
|
||||
Use this instead of browser_press when the focused element is in an overlay
|
||||
or virtual-rendered component. Moving the mouse first routes the key event
|
||||
through native browser hit-testing instead of the DOM focus chain.
|
||||
Pair with browser_coords to convert screenshot image positions to CSS pixels.
|
||||
``x`` / ``y`` are fractions of the viewport; the tool converts
|
||||
to CSS px internally.
|
||||
|
||||
Args:
|
||||
x: CSS pixel X coordinate to position mouse
|
||||
y: CSS pixel Y coordinate to position mouse
|
||||
x: X fraction of the viewport (0..1).
|
||||
y: Y fraction of the viewport (0..1).
|
||||
key: Key to press (e.g. 'Enter', 'Space', 'Escape', 'ArrowDown')
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
@@ -586,8 +624,22 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
log_tool_call("browser_press_at", params, result=result)
|
||||
return result
|
||||
|
||||
if x > 1.5 or y > 1.5 or x < -0.1 or y < -0.1:
|
||||
result = {
|
||||
"ok": False,
|
||||
"error": (
|
||||
f"Coords ({x}, {y}) look like pixels. This tool expects "
|
||||
"fractions 0..1 of the viewport."
|
||||
),
|
||||
}
|
||||
log_tool_call("browser_press_at", params, result=result)
|
||||
return result
|
||||
|
||||
try:
|
||||
press_result = await bridge.press_key_at(target_tab, x, y, key)
|
||||
from .inspection import _ensure_viewport_size
|
||||
|
||||
cw, ch = await _ensure_viewport_size(target_tab)
|
||||
press_result = await bridge.press_key_at(target_tab, x * cw, y * ch, key)
|
||||
log_tool_call(
|
||||
"browser_press_at",
|
||||
params,
|
||||
|
||||
@@ -35,6 +35,23 @@ def _resolve_profile(profile: str | None) -> str:
|
||||
_EXTENSION_PATH = (Path(__file__).parent.parent.parent.parent.parent / "browser-extension").resolve()
|
||||
|
||||
|
||||
def _clear_profile_tab_caches(ctx: dict[str, Any]) -> None:
|
||||
"""Clear per-tab caches for every tab the profile knew about.
|
||||
|
||||
Individual tab closes go through ``bridge.close_tab`` which clears
|
||||
caches per-tab; context destroys close every tab at once without
|
||||
per-tab notifications, so we clear them here from the tracked set.
|
||||
"""
|
||||
tab_ids = ctx.get("tabs") or set()
|
||||
if not tab_ids:
|
||||
return
|
||||
from ..bridge import clear_tab_highlights
|
||||
from .inspection import clear_tab_state
|
||||
|
||||
clear_tab_state(tab_ids)
|
||||
clear_tab_highlights(tab_ids)
|
||||
|
||||
|
||||
async def shutdown_all_contexts() -> None:
|
||||
"""Close all active browser contexts. Called at GCU server shutdown."""
|
||||
if not _contexts:
|
||||
@@ -42,6 +59,7 @@ async def shutdown_all_contexts() -> None:
|
||||
bridge = get_bridge()
|
||||
for profile_name, ctx in list(_contexts.items()):
|
||||
group_id = ctx.get("groupId")
|
||||
_clear_profile_tab_caches(ctx)
|
||||
if group_id is not None and bridge and bridge.is_connected:
|
||||
try:
|
||||
await bridge.destroy_context(group_id)
|
||||
@@ -232,6 +250,7 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
|
||||
"groupId": group_id,
|
||||
"activeTabId": tab_id,
|
||||
"_seedTabId": tab_id, # reused by first browser_open call
|
||||
"tabs": {tab_id} if tab_id is not None else set(),
|
||||
}
|
||||
|
||||
logger.info(
|
||||
@@ -299,6 +318,9 @@ def register_lifecycle_tools(mcp: FastMCP) -> None:
|
||||
try:
|
||||
group_id = ctx.get("groupId")
|
||||
closed_tabs = 0
|
||||
# Clear per-tab caches before tearing down the group — once
|
||||
# destroyed we won't get per-tab close notifications.
|
||||
_clear_profile_tab_caches(ctx)
|
||||
if group_id is not None:
|
||||
result = await bridge.destroy_context(group_id)
|
||||
closed_tabs = result.get("closedTabs", 0)
|
||||
|
||||
@@ -134,6 +134,11 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
result = await bridge.create_tab(url=url, group_id=ctx.get("groupId"))
|
||||
tab_id = result.get("tabId")
|
||||
|
||||
# Track tab_ids so browser_stop can clear per-tab caches
|
||||
# for every tab in this profile at once.
|
||||
if tab_id is not None:
|
||||
ctx.setdefault("tabs", set()).add(tab_id)
|
||||
|
||||
# Update active tab if not background
|
||||
if not background and tab_id is not None:
|
||||
ctx["activeTabId"] = tab_id
|
||||
@@ -201,6 +206,12 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
try:
|
||||
await bridge.close_tab(target_tab)
|
||||
|
||||
# Forget the closed tab so ctx["tabs"] only reflects tabs
|
||||
# that could still get per-tab cache activity.
|
||||
tabs_set = ctx.get("tabs")
|
||||
if isinstance(tabs_set, set):
|
||||
tabs_set.discard(target_tab)
|
||||
|
||||
# Update active tab if we closed it
|
||||
if ctx.get("activeTabId") == target_tab:
|
||||
result = await bridge.list_tabs(ctx.get("groupId"))
|
||||
@@ -300,6 +311,7 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
active_tab_id = ctx.get("activeTabId")
|
||||
|
||||
closed = 0
|
||||
tabs_set = ctx.get("tabs") if isinstance(ctx.get("tabs"), set) else None
|
||||
for tab in tabs:
|
||||
tid = tab.get("id")
|
||||
if keep_active and tid == active_tab_id:
|
||||
@@ -307,6 +319,8 @@ def register_tab_tools(mcp: FastMCP) -> None:
|
||||
try:
|
||||
await bridge.close_tab(tid)
|
||||
closed += 1
|
||||
if tabs_set is not None and tid is not None:
|
||||
tabs_set.discard(tid)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@@ -139,7 +139,10 @@ def main() -> None:
|
||||
mcp.run(transport="stdio")
|
||||
else:
|
||||
logger.info(f"Starting GCU server on {args.host}:{args.port}")
|
||||
mcp.run(transport="http", host=args.host, port=args.port)
|
||||
# FastMCP.run() forwards kwargs to anyio.run() instead of the
|
||||
# transport, which breaks host/port for SSE. Invoke run_async
|
||||
# directly so the kwargs land on run_sse_async.
|
||||
asyncio.run(mcp.run_async(transport="sse", host=args.host, port=args.port))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user