fix: tool credential filter

This commit is contained in:
Timothy
2026-04-11 12:54:26 -07:00
parent b5e05fefae
commit eeb46a2b3e
6 changed files with 561 additions and 93 deletions
+9 -2
View File
@@ -19,11 +19,18 @@
"Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -type f -exec grep -l \"FileConversationStore\\\\|class.*ConversationStore\" {} \\\\;)",
"Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -exec grep -l \"run_parallel_workers\\\\|create_colony\" {} \\\\;)",
"Bash(awk '/^ async def execute\\\\\\(self, ctx: AgentContext\\\\\\)/,/^ async def [a-z_]+/ {print NR\": \"$0}' /home/timothy/aden/hive/core/framework/agent_loop/agent_loop.py)",
"Bash(grep -r \"max_concurrent_workers\\\\|max_depth\\\\|recursion\\\\|spawn.*bomb\" /home/timothy/aden/hive/core/framework/host/*.py)"
"Bash(grep -r \"max_concurrent_workers\\\\|max_depth\\\\|recursion\\\\|spawn.*bomb\" /home/timothy/aden/hive/core/framework/host/*.py)",
"Bash(wc -l /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
"Bash(file /tmp/gcu_verify/*.png)",
"Bash(ps -eo pid,cmd)",
"Bash(ps -o pid,lstart,cmd -p 746640)",
"Bash(kill 746636)",
"Bash(ps -eo pid,lstart,cmd)"
],
"additionalDirectories": [
"/home/timothy/.hive/skills/writing-hive-skills",
"/tmp"
"/tmp",
"/home/timothy/.hive/skills"
]
},
"hooks": {
+2
View File
@@ -85,6 +85,7 @@ from .template import TemplateResolver
from .validation import (
CredentialStatus,
CredentialValidationResult,
compute_unavailable_tools,
ensure_credential_key_env,
validate_agent_credentials,
)
@@ -150,6 +151,7 @@ __all__ = [
# Validation
"ensure_credential_key_env",
"validate_agent_credentials",
"compute_unavailable_tools",
"CredentialStatus",
"CredentialValidationResult",
# Interactive setup
+40
View File
@@ -236,6 +236,46 @@ def _presync_aden_tokens(credential_specs: dict, *, force: bool = False) -> None
)
def compute_unavailable_tools(nodes: list) -> tuple[set[str], list[str]]:
    """Work out which tools to drop because their credential check failed.

    Calls ``validate_agent_credentials`` with ``verify=False`` and
    ``raise_on_error=False`` and walks ``result.failed``. Every tool
    listed on a failed status goes into the drop set, and one summary
    line is produced per failed credential that has tools attached.

    Intended for worker-spawn preflight, so that a missing credential
    removes the affected tools rather than aborting the spawn. Per the
    original author's note this targets non-MCP tools; MCP tools are
    gated separately by ``_build_mcp_admission_gate``.

    Args:
        nodes: Graph nodes, forwarded unchanged to
            ``validate_agent_credentials``.

    Returns:
        ``(tool_names_to_drop, human_messages)``. Both are empty when no
        failed credential carries tools, or when validation itself
        raised (that error is logged at debug level and swallowed).
    """
    try:
        outcome = validate_agent_credentials(nodes, verify=False, raise_on_error=False)
    except Exception as exc:
        logger.debug("compute_unavailable_tools: validation raised: %s", exc)
        return set(), []

    tools_to_drop: set[str] = set()
    summaries: list[str] = []

    for status in outcome.failed:
        tool_names = status.tools
        if not tool_names:
            # A failed credential with no tools bound has nothing to drop.
            continue
        tools_to_drop.update(tool_names)

        # Classify the failure; "missing" is the fallback label.
        if status.aden_not_connected:
            reason = "aden_not_connected"
        elif status.available and status.valid is False:
            reason = "invalid"
        else:
            reason = "missing"

        # Show at most six tool names and summarise the remainder.
        overflow = len(tool_names) - 6
        suffix = f" +{overflow} more" if overflow > 0 else ""
        summaries.append(
            f"{status.env_var} ({reason}) → drops {len(tool_names)} tool(s): "
            f"{', '.join(tool_names[:6])}"
            + suffix
        )

    return tools_to_drop, summaries
def validate_agent_credentials(
nodes: list,
quiet: bool = False,
+77 -18
View File
@@ -44,7 +44,7 @@ from typing import TYPE_CHECKING, Any
from framework.credentials.models import CredentialError
from framework.host.event_bus import AgentEvent, EventType
from framework.loader.preload_validation import credential_errors_to_json, validate_credentials
from framework.loader.preload_validation import credential_errors_to_json
from framework.server.app import validate_agent_path
from framework.tools.flowchart_utils import (
FLOWCHART_TYPES,
@@ -3875,24 +3875,50 @@ def register_queen_lifecycle_tools(
)
try:
# Pre-flight: validate credentials and resync MCP servers.
# Still uses the legacy AgentHost handles because that's
# where credentials live; the actual run is via colony.
# Pre-flight: compute the set of tools whose credentials are
# NOT currently available, and resync MCP servers. We do NOT
# hard-fail on missing credentials anymore — instead we drop
# the affected tools from the worker's spawn_tools list a
# few lines below. Hard-failing here caused unrelated tools
# (e.g. GitHub tools leaking into a LinkedIn worker config)
# to block the whole spawn with a CredentialError; the fix
# is to treat unset credentials as "drop these tools" rather
# than "abort the worker".
#
# Note: the MCP admission gate (_build_mcp_admission_gate in
# tool_registry.py) already filters MCP tools at registration
# time. This preflight covers the non-MCP path — tools.py
# discoveries via discover_from_module — which has no
# credential gate of its own.
loop = asyncio.get_running_loop()
unavailable_tools: set[str] = set()
async def _preflight():
cred_error: CredentialError | None = None
nonlocal unavailable_tools
try:
await loop.run_in_executor(
from framework.credentials.validation import compute_unavailable_tools
drop, messages = await loop.run_in_executor(
None,
lambda: validate_credentials(
legacy.graph.nodes,
interactive=False,
skip=False,
),
lambda: compute_unavailable_tools(legacy.graph.nodes),
)
unavailable_tools = drop
if drop:
logger.warning(
"run_agent_with_input: dropping %d tool(s) with "
"unavailable credentials from worker spawn: %s",
len(drop),
"; ".join(messages),
)
except Exception as exc:
# Validation itself failing (not a credential failure —
# a code error in the validator) should not block the
# spawn. Log and proceed as if nothing was dropped.
logger.warning(
"compute_unavailable_tools raised, proceeding without "
"credential-based tool filtering: %s",
exc,
)
except CredentialError as e:
cred_error = e
runner = getattr(session, "runner", None)
if runner:
@@ -3904,9 +3930,6 @@ def register_queen_lifecycle_tools(
except Exception as e:
logger.warning("MCP resync failed: %s", e)
if cred_error is not None:
raise cred_error
try:
await asyncio.wait_for(_preflight(), timeout=_START_PREFLIGHT_TIMEOUT)
except TimeoutError:
@@ -3914,8 +3937,6 @@ def register_queen_lifecycle_tools(
"run_agent_with_input preflight timed out after %ds — proceeding",
_START_PREFLIGHT_TIMEOUT,
)
except CredentialError:
raise # handled below
# Build a per-spawn AgentSpec that mirrors the loaded
# worker's entry-node identity. This is what makes the
@@ -3944,6 +3965,24 @@ def register_queen_lifecycle_tools(
else []
)
# Drop any tool whose credential isn't available (GitHub
# tools when GITHUB_TOKEN is unset, etc). The preflight
# above populated ``unavailable_tools``; apply the filter
# HERE — before the AgentSpec is built — so the worker
# only sees tools it can actually run.
dropped_from_names: list[str] = []
if unavailable_tools:
original = worker_tool_names
worker_tool_names = [t for t in original if t not in unavailable_tools]
dropped_from_names = [t for t in original if t in unavailable_tools]
if dropped_from_names:
logger.warning(
"run_agent_with_input: dropped %d tool(s) from worker "
"AgentSpec due to unavailable credentials: %s",
len(dropped_from_names),
", ".join(dropped_from_names),
)
spawn_spec = AgentSpec(
id=f"loaded_worker:{getattr(graph, 'id', 'unknown')}",
name=getattr(graph, "id", "loaded_worker"),
@@ -3962,6 +4001,26 @@ def register_queen_lifecycle_tools(
spawn_tools = list(getattr(legacy, "_tools", []) or [])
spawn_tool_executor = getattr(legacy, "_tool_executor", None)
# Same credential-based filter on the live Tool objects
# passed to the worker. Without this the worker would still
# receive the GitHub tool definitions in its registry —
# it just wouldn't see them in its AgentSpec, so the LLM
# wouldn't know to use them. Dropping from both lists
# makes the filter complete.
if unavailable_tools:
before = len(spawn_tools)
spawn_tools = [
t for t in spawn_tools
if getattr(t, "name", None) not in unavailable_tools
]
dropped_count = before - len(spawn_tools)
if dropped_count:
logger.info(
"run_agent_with_input: dropped %d tool object(s) from "
"spawn_tools (unavailable credentials)",
dropped_count,
)
worker_ids = await colony.spawn(
task=task,
count=1,
+393 -68
View File
@@ -123,6 +123,21 @@ class BeelineBridge:
logger.warning("Bridge status server could not start on port %d: %s", status_port, e)
async def stop(self) -> None:
# Cancel in-flight bridge requests so any caller stuck in _send
# sees CancelledError immediately instead of waiting the full
# 30s timeout. Mirrors the cleanup in _handle_connection's
# disconnect branch so both exit paths behave the same.
for fut in self._pending.values():
if not fut.done():
fut.cancel()
self._pending.clear()
# Drop CDP attach cache — next run must re-attach fresh.
self._cdp_attached.clear()
# Drop highlight state — stale entries would otherwise carry
# over into a subsequent run and confuse screenshot annotation.
_interaction_highlights.clear()
self._ws = None
if self._server:
self._server.close()
try:
@@ -222,7 +237,14 @@ class BeelineBridge:
fut.cancel()
self._pending.clear()
async def _send(self, type_: str, **params) -> dict:
# Default wait on a bridge command. Callers with known-slow ops
# (full-page screenshots on slow networks, AX tree on huge pages)
# can pass a longer value via _send(..., timeout=...). Using the
# same default as the old hard-coded value so existing call sites
# don't regress.
_DEFAULT_SEND_TIMEOUT_S: float = 30.0
async def _send(self, type_: str, *, timeout: float | None = None, **params) -> dict:
"""Send a command to the extension and wait for the result."""
if not self._ws:
raise RuntimeError("Extension not connected")
@@ -231,27 +253,58 @@ class BeelineBridge:
fut: asyncio.Future = asyncio.get_event_loop().create_future()
self._pending[msg_id] = fut
start = time.perf_counter()
effective_timeout = timeout if timeout is not None else self._DEFAULT_SEND_TIMEOUT_S
log_bridge_message("send", type_, msg_id=msg_id, params=params)
try:
await self._ws.send(json.dumps({"id": msg_id, "type": type_, **params}))
result = await asyncio.wait_for(fut, timeout=30.0)
result = await asyncio.wait_for(fut, timeout=effective_timeout)
duration_ms = (time.perf_counter() - start) * 1000
log_bridge_message("send", type_, msg_id=msg_id, result=result, duration_ms=duration_ms)
return result
except TimeoutError:
self._pending.pop(msg_id, None)
log_bridge_message("send", type_, msg_id=msg_id, error="timeout")
raise RuntimeError(f"Bridge command '{type_}' timed out") from None
# Include which CDP method (if any) so the caller can see
# what actually hung — the generic 'cdp' type is useless
# when ten different CDP calls use the same type.
detail = f" method={params.get('method')}" if params.get("method") else ""
raise RuntimeError(
f"Bridge command '{type_}'{detail} timed out after {effective_timeout:.0f}s"
) from None
except BaseException:
# CancelledError or any other exception — remove stale future so a late
# response from the extension doesn't try to resolve a cancelled future.
self._pending.pop(msg_id, None)
raise
# Substrings that indicate Chrome detached the debugger out from
# under us (tab closed, user opened DevTools, cross-origin nav).
# Our in-memory _cdp_attached set is now stale; next call should
# re-attach rather than reporting a cryptic "Target not found".
_CDP_DEAD_SESSION_MARKERS = (
"target closed",
"target not found",
"not attached",
"session closed",
"inspector already attached",
"no target with given id",
)
def _is_cdp_dead_session(self, exc: BaseException) -> bool:
    """Return True if the exception text contains a dead-session marker.

    The message is lower-cased and checked for each substring in
    ``_CDP_DEAD_SESSION_MARKERS``.
    """
    lowered = str(exc).lower()
    for marker in self._CDP_DEAD_SESSION_MARKERS:
        if marker in lowered:
            return True
    return False
async def _cdp(self, tab_id: int, method: str, params: dict | None = None) -> dict:
"""Send a CDP command to a tab."""
"""Send a CDP command to a tab.
On a dead-session error (Chrome detached externally — tab closed,
DevTools opened, cross-origin nav), evict the stale attach
cache entry, reattach, and retry once. Without this the Python
side would keep assuming it's attached and every subsequent call
would hit the same error until someone restarted the bridge.
"""
start = time.perf_counter()
try:
result = await self._send("cdp", tabId=tab_id, method=method, params=params or {})
@@ -261,6 +314,33 @@ class BeelineBridge:
except Exception as e:
duration_ms = (time.perf_counter() - start) * 1000
log_cdp_command(tab_id, method, params, error=str(e), duration_ms=duration_ms)
if self._is_cdp_dead_session(e):
logger.info(
"CDP session for tab %d looks dead (%s) — re-attaching and retrying",
tab_id,
str(e)[:120],
)
self._cdp_attached.discard(tab_id)
try:
reattach = await self._send("cdp.attach", tabId=tab_id)
if reattach.get("ok"):
self._cdp_attached.add(tab_id)
retry_start = time.perf_counter()
result = await self._send(
"cdp", tabId=tab_id, method=method, params=params or {}
)
log_cdp_command(
tab_id,
method,
params,
result,
duration_ms=(time.perf_counter() - retry_start) * 1000,
)
return result
except Exception as retry_exc:
logger.debug(
"CDP reattach+retry for tab %d failed: %s", tab_id, retry_exc
)
raise
async def _try_enable_domain(self, tab_id: int, domain: str) -> None:
@@ -311,7 +391,14 @@ class BeelineBridge:
async def close_tab(self, tab_id: int) -> dict:
"""Close a tab by ID."""
return await self._send("tab.close", tabId=tab_id)
result = await self._send("tab.close", tabId=tab_id)
# Drop per-tab state — the id may be reused by Chrome much
# later, and carrying a stale highlight or "attached" flag
# forward would misannotate screenshots or skip a needed
# reattach on the reused id.
self._cdp_attached.discard(tab_id)
_interaction_highlights.pop(tab_id, None)
return result
async def list_tabs(self, group_id: int | None = None) -> dict:
"""List tabs, optionally filtered by group.
@@ -361,6 +448,11 @@ class BeelineBridge:
if wait_until not in VALID_WAIT_UNTIL:
wait_until = "load"
# Drop the stale interaction highlight before loading a new
# page — otherwise the next screenshot will annotate the new
# page with a rect from the previous page's coordinate system.
_interaction_highlights.pop(tab_id, None)
# Attach debugger if needed
await self.cdp_attach(tab_id)
@@ -382,9 +474,11 @@ class BeelineBridge:
"Runtime.evaluate",
{"expression": "document.readyState", "returnByValue": True},
)
ready_state = (
(eval_result or {}).get("result", {}).get("result", {}).get("value", "")
)
# _cdp returns the CDP response body; Runtime.evaluate shape
# is {"result": {"type": ..., "value": ...}} — one "result"
# hop, not two. The extra hop was always returning "" and
# this entire lifecycle loop was running until the deadline.
ready_state = (eval_result or {}).get("result", {}).get("value", "")
if wait_until == "domcontentloaded" and ready_state in ("interactive", "complete"):
break
@@ -416,17 +510,31 @@ class BeelineBridge:
return {
"ok": True,
"tabId": tab_id,
"url": (url_result or {}).get("result", {}).get("result", {}).get("value", ""),
"title": (title_result or {}).get("result", {}).get("result", {}).get("value", ""),
"url": (url_result or {}).get("result", {}).get("value", ""),
"title": (title_result or {}).get("result", {}).get("value", ""),
}
async def go_back(self, tab_id: int) -> dict:
"""Navigate back in history."""
"""Navigate back in history.
Uses ``history.back()`` via Runtime.evaluate — modern Chrome CDP
no longer exposes ``Page.goBack`` / ``Page.goForward`` (removed
in favour of ``Page.navigateToHistoryEntry``, which requires
first fetching the history list). ``history.back()`` is simpler,
works across every Chrome version, and matches what the user
expects when they call ``browser_go_back``.
"""
_interaction_highlights.pop(tab_id, None)
await self.cdp_attach(tab_id)
await self._cdp(tab_id, "Page.enable")
await self._cdp(tab_id, "Page.goBack")
# Get current URL
await self._cdp(
tab_id,
"Runtime.evaluate",
{"expression": "history.back()", "returnByValue": True},
)
# Give the browser a beat to commit the navigation before we
# read the new URL.
await asyncio.sleep(0.3)
result = await self._cdp(
tab_id,
"Runtime.evaluate",
@@ -435,15 +543,20 @@ class BeelineBridge:
return {
"ok": True,
"action": "back",
"url": (result or {}).get("result", {}).get("result", {}).get("value", ""),
"url": (result or {}).get("result", {}).get("value", ""),
}
async def go_forward(self, tab_id: int) -> dict:
"""Navigate forward in history."""
"""Navigate forward in history. See go_back() for why we use JS."""
_interaction_highlights.pop(tab_id, None)
await self.cdp_attach(tab_id)
await self._cdp(tab_id, "Page.enable")
await self._cdp(tab_id, "Page.goForward")
await self._cdp(
tab_id,
"Runtime.evaluate",
{"expression": "history.forward()", "returnByValue": True},
)
await asyncio.sleep(0.3)
result = await self._cdp(
tab_id,
"Runtime.evaluate",
@@ -452,11 +565,12 @@ class BeelineBridge:
return {
"ok": True,
"action": "forward",
"url": (result or {}).get("result", {}).get("result", {}).get("value", ""),
"url": (result or {}).get("result", {}).get("value", ""),
}
async def reload(self, tab_id: int) -> dict:
"""Reload the page."""
_interaction_highlights.pop(tab_id, None)
await self.cdp_attach(tab_id)
await self._cdp(tab_id, "Page.enable")
await self._cdp(tab_id, "Page.reload")
@@ -469,7 +583,7 @@ class BeelineBridge:
return {
"ok": True,
"action": "reload",
"url": (result or {}).get("result", {}).get("result", {}).get("value", ""),
"url": (result or {}).get("result", {}).get("value", ""),
}
# ── Interaction ────────────────────────────────────────────────────────────
@@ -759,75 +873,150 @@ class BeelineBridge:
clear_first: bool = True,
delay_ms: int = 0,
timeout_ms: int = 30000,
use_insert_text: bool = True,
) -> dict:
"""Type text into an element.
Uses JavaScript focus for reliability, then CDP key events.
Routes through a real CDP pointer click on the target rect BEFORE
inserting text. This is critical for rich-text editors (Draft.js,
Lexical, ProseMirror, React-controlled contenteditable): those
frameworks only register input as "real" after seeing a native
focus event sourced from a real pointer interaction — a
JS-sourced ``el.focus()`` is ignored, and the submit button
stays disabled because the framework's internal state never
updates. Sending a CDP click first fires the real
pointerdown/pointerup/click/focus sequence that every modern
framework listens to.
After clicking, we insert text via ``Input.insertText`` by
default (``use_insert_text=True``). insertText is a dedicated
CDP method that asks the browser to commit text into the
focused element as if IME just committed it — it works
cleanly on rich editors where per-character keyDown events
would otherwise be eaten or mis-timed (empirically verified
against LinkedIn's Lexical message composer 2026-04-11).
Playwright uses the same approach under the hood.
Set ``use_insert_text=False`` to get the old per-character
keyDown/keyUp path when an editor needs precise keystroke
timing (autocomplete triggers, code editors that fire on
specific chars, ``delay_ms`` typing animations).
"""
await self.cdp_attach(tab_id)
await self._try_enable_domain(tab_id, "DOM")
await self._try_enable_domain(tab_id, "Input")
await self._try_enable_domain(tab_id, "Runtime")
# First, scroll into view and focus via JavaScript (more reliable than CDP)
# Find + scroll + (optionally) clear via JS. We still need the
# rect, and clearing via `.value = ''` / `.textContent = ''`
# is the most reliable way to reset pre-existing content.
focus_script = f"""
(function() {{
const el = document.querySelector({json.dumps(selector)});
if (!el) return false;
if (!el) return null;
// Scroll into view
// Scroll into view so the click lands in-viewport.
el.scrollIntoView({{ block: 'center' }});
// Focus the element
el.focus();
// Clear if requested
// Clear if requested.
if ({str(clear_first).lower()}) {{
if (el.value !== undefined) {{
el.value = '';
// Nudge React's onChange — the framework reads
// .value via a setter hook, and without firing
// an input event the component state remains
// stale after our value assignment.
el.dispatchEvent(new Event('input', {{bubbles: true}}));
}} else if (el.isContentEditable) {{
el.textContent = '';
el.dispatchEvent(new Event('input', {{bubbles: true}}));
}}
}}
return true;
const r = el.getBoundingClientRect();
return {{
x: r.left + r.width / 2,
y: r.top + r.height / 2,
w: r.width,
h: r.height,
}};
}})();
"""
focus_result = await self.evaluate(tab_id, focus_script)
success = (focus_result or {}).get("result", False)
rect = (focus_result or {}).get("result")
if not success:
# Element not found - wait and retry
if not rect:
# Element not found — wait + retry until timeout.
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
while asyncio.get_event_loop().time() < deadline:
result = await self.evaluate(tab_id, focus_script)
if result and (result or {}).get("result", False):
success = True
rect = (result or {}).get("result") if result else None
if rect:
break
await asyncio.sleep(0.1)
if not success:
if not rect:
return {"ok": False, "error": f"Element not found: {selector}"}
await asyncio.sleep(0.05) # Wait for focus to take effect
if not rect.get("w") or not rect.get("h"):
return {
"ok": False,
"error": f"Element has zero dimensions, can't click to focus: {selector}",
}
# Type each character using CDP key events
for char in text:
# Dispatch key down
await self._cdp(
tab_id,
"Input.dispatchKeyEvent",
{"type": "keyDown", "text": char},
)
# Dispatch key up
await self._cdp(
tab_id,
"Input.dispatchKeyEvent",
{"type": "keyUp", "text": char},
)
if delay_ms > 0:
await asyncio.sleep(delay_ms / 1000)
# Fire a real CDP pointer click at the element's center. This is
# what unblocks rich-text editors — JS el.focus() is not enough.
click_x = rect["x"]
click_y = rect["y"]
await self._cdp(
tab_id,
"Input.dispatchMouseEvent",
{"type": "mousePressed", "x": click_x, "y": click_y, "button": "left", "clickCount": 1},
)
await self._cdp(
tab_id,
"Input.dispatchMouseEvent",
{"type": "mouseReleased", "x": click_x, "y": click_y, "button": "left", "clickCount": 1},
)
await asyncio.sleep(0.15) # Let focus / editor-init animations settle.
if use_insert_text and delay_ms <= 0:
# CDP Input.insertText is the most reliable way to insert
# text into a rich-text editor. It bypasses the keyboard
# event pipeline entirely and commits text into the focused
# element as if IME just committed it. Works on plain
# <input>/<textarea>, contenteditable, Lexical, Draft.js,
# ProseMirror, Monaco textarea buffers — verified empirically
# against LinkedIn's message composer (Lexical) on 2026-04-11
# where the per-char keyDown path left the editor empty.
await self._cdp(tab_id, "Input.insertText", {"text": text})
else:
# Fallback path: per-character keyDown/keyUp with full key,
# code, and text fields. Used when the caller explicitly
# wants per-keystroke dispatch (autocomplete testing, code
# editors that fire on specific chars, animated typing
# with ``delay_ms``). Populating ``code`` for ASCII is
# needed so frameworks that branch on ``event.code`` see
# the right values.
for char in text:
key_params: dict[str, Any] = {
"type": "keyDown",
"text": char,
"key": char,
}
if len(char) == 1 and char.isalpha():
key_params["code"] = f"Key{char.upper()}"
elif len(char) == 1 and char.isdigit():
key_params["code"] = f"Digit{char}"
await self._cdp(tab_id, "Input.dispatchKeyEvent", key_params)
key_up = {"type": "keyUp", "key": char}
if "code" in key_params:
key_up["code"] = key_params["code"]
await self._cdp(tab_id, "Input.dispatchKeyEvent", key_up)
if delay_ms > 0:
await asyncio.sleep(delay_ms / 1000)
# Highlight the element that was typed into
rect_result = await self.evaluate(
@@ -844,12 +1033,47 @@ class BeelineBridge:
)
return {"ok": True, "action": "type", "selector": selector, "length": len(text)}
async def press_key(self, tab_id: int, key: str, selector: str | None = None) -> dict:
"""Press a keyboard key.
# CDP Input.dispatchKeyEvent modifiers bitmask.
_CDP_MODIFIERS = {"alt": 1, "ctrl": 2, "control": 2, "meta": 4, "cmd": 4, "shift": 8}
# How Chrome expects each modifier key as its OWN keyDown event —
# name, code, and Windows virtual key code. Dispatched before the
# main key so Chrome sees the modifier as "held" during the main
# event, which is what actually triggers browser shortcuts like
# Ctrl+A, Cmd+L, Shift+Tab.
_MODIFIER_KEYS = {
"alt": {"key": "Alt", "code": "AltLeft", "windowsVirtualKeyCode": 18},
"ctrl": {"key": "Control", "code": "ControlLeft", "windowsVirtualKeyCode": 17},
"control": {"key": "Control", "code": "ControlLeft", "windowsVirtualKeyCode": 17},
"meta": {"key": "Meta", "code": "MetaLeft", "windowsVirtualKeyCode": 91},
"cmd": {"key": "Meta", "code": "MetaLeft", "windowsVirtualKeyCode": 91},
"shift": {"key": "Shift", "code": "ShiftLeft", "windowsVirtualKeyCode": 16},
}
def _cdp_modifier_mask(self, modifiers: list[str] | None) -> int:
    """Fold modifier names into the bitmask defined by ``_CDP_MODIFIERS``.

    Names are matched case-insensitively; names missing from the table
    contribute nothing. ``None`` or an empty list yields ``0``.
    """
    bits = 0
    for name in modifiers or ():
        bits |= self._CDP_MODIFIERS.get(name.lower(), 0)
    return bits
async def press_key(
self,
tab_id: int,
key: str,
selector: str | None = None,
modifiers: list[str] | None = None,
) -> dict:
"""Press a keyboard key, optionally with modifier keys held.
Args:
key: Key name like 'Enter', 'Tab', 'Escape', 'ArrowDown', etc.
selector: Optional selector to focus first
modifiers: Optional list of modifier keys to hold while pressing
``key``. Accepted values: "alt", "ctrl"/"control", "meta"/"cmd",
"shift". Example: ``modifiers=["ctrl"]`` → Ctrl+key, which
enables shortcuts like Ctrl+A, Ctrl+L, Cmd+Enter, Shift+Tab.
"""
await self.cdp_attach(tab_id)
await self._try_enable_domain(tab_id, "Input")
@@ -882,19 +1106,110 @@ class BeelineBridge:
}
text, key_name = key_map.get(key, (key, key))
mod_mask = self._cdp_modifier_mask(modifiers)
await self._cdp(
tab_id,
"Input.dispatchKeyEvent",
{"type": "keyDown", "key": key_name, "text": text if text else None},
)
await self._cdp(
tab_id,
"Input.dispatchKeyEvent",
{"type": "keyUp", "key": key_name, "text": text if text else None},
)
# With modifiers held, suppress the printable text so that
# e.g. Ctrl+A doesn't also type the character "a" into the
# focused field (CDP will still fire the shortcut).
effective_text = text if (text and mod_mask == 0) else None
return {"ok": True, "action": "press", "key": key}
# Compute ``code`` and ``windowsVirtualKeyCode`` for the main
# key. These are MANDATORY for Chrome's shortcut dispatcher —
# without them, Ctrl+A etc. reach the DOM with ``code=""`` and
# ``which=0`` and Chrome doesn't recognise them as shortcuts.
# Verified empirically on chrome 131 against a real input.
main_code: str | None = None
main_vk: int | None = None
special_vk = {
"Enter": (13, "Enter"),
"Tab": (9, "Tab"),
"Escape": (27, "Escape"),
"Backspace": (8, "Backspace"),
"Delete": (46, "Delete"),
"ArrowUp": (38, "ArrowUp"),
"ArrowDown": (40, "ArrowDown"),
"ArrowLeft": (37, "ArrowLeft"),
"ArrowRight": (39, "ArrowRight"),
"Home": (36, "Home"),
"End": (35, "End"),
"PageUp": (33, "PageUp"),
"PageDown": (34, "PageDown"),
}
if key_name in special_vk:
main_vk, main_code = special_vk[key_name]
elif len(key_name) == 1 and key_name.isalpha():
main_code = f"Key{key_name.upper()}"
main_vk = ord(key_name.upper()) # 'A' = 65 ... 'Z' = 90
elif len(key_name) == 1 and key_name.isdigit():
main_code = f"Digit{key_name}"
main_vk = ord(key_name) # '0' = 48 ... '9' = 57
# Press each modifier as a separate keyDown BEFORE the main
# key. Sending ``modifiers: mask`` on the main key alone isn't
# enough — Chrome's shortcut dispatcher looks for a held
# modifier event, not just a flag. Matches the Playwright /
# Puppeteer sequence. Release modifiers in reverse order after
# the main key so the "held" state is correct throughout.
pressed_mods: list[dict] = []
if modifiers:
for m in modifiers:
spec = self._MODIFIER_KEYS.get(m.lower())
if spec is None:
continue
await self._cdp(
tab_id,
"Input.dispatchKeyEvent",
{
"type": "keyDown",
"key": spec["key"],
"code": spec["code"],
"windowsVirtualKeyCode": spec["windowsVirtualKeyCode"],
"modifiers": mod_mask,
},
)
pressed_mods.append(spec)
main_down: dict[str, Any] = {
# Use rawKeyDown when a modifier is held so Chrome skips
# text insertion and routes the event to the shortcut
# dispatcher. For plain press_key without modifiers we can
# use regular keyDown.
"type": "rawKeyDown" if mod_mask else "keyDown",
"key": key_name,
"text": effective_text,
"modifiers": mod_mask,
}
main_up: dict[str, Any] = {
"type": "keyUp",
"key": key_name,
"text": effective_text,
"modifiers": mod_mask,
}
if main_code is not None:
main_down["code"] = main_code
main_up["code"] = main_code
if main_vk is not None:
main_down["windowsVirtualKeyCode"] = main_vk
main_up["windowsVirtualKeyCode"] = main_vk
await self._cdp(tab_id, "Input.dispatchKeyEvent", main_down)
await self._cdp(tab_id, "Input.dispatchKeyEvent", main_up)
# Release modifiers in reverse order.
for spec in reversed(pressed_mods):
await self._cdp(
tab_id,
"Input.dispatchKeyEvent",
{
"type": "keyUp",
"key": spec["key"],
"code": spec["code"],
"windowsVirtualKeyCode": spec["windowsVirtualKeyCode"],
"modifiers": 0,
},
)
return {"ok": True, "action": "press", "key": key, "modifiers": modifiers or []}
# Shared JS snippet: shadow-piercing querySelector via ">>>" separator
_SHADOW_QUERY_JS = """
@@ -916,9 +1231,15 @@ class BeelineBridge:
Example: '#interop-outlet >>> #ember37 >>> p'
"""
await self.cdp_attach(tab_id)
# IMPORTANT: the whole script must be a single IIFE so that
# bridge.evaluate() detects it as "already wrapped" and returns
# its value. If you let evaluate() re-wrap a script that
# starts with a function declaration, the outer wrapper
# discards the inner IIFE's return and you always get None —
# which is exactly the bug this code had until 2026-04-11.
script = (
f"{self._SHADOW_QUERY_JS}"
f"(function(){{"
f"{self._SHADOW_QUERY_JS}"
f"const el=_shadowQuery({json.dumps(selector)});"
f"if(!el)return null;"
f"const r=el.getBoundingClientRect();"
@@ -1945,7 +2266,10 @@ class BeelineBridge:
"Runtime.evaluate",
{"expression": script, "returnByValue": True},
)
found = (result or {}).get("result", {}).get("result", {}).get("value", False)
# One "result" hop — see navigate() comment. This was silently
# returning False on every poll, so wait_for_selector always
# reported "not found" after the full timeout.
found = (result or {}).get("result", {}).get("value", False)
if found:
return {"ok": True, "selector": selector}
await asyncio.sleep(0.1)
@@ -1969,7 +2293,8 @@ class BeelineBridge:
"Runtime.evaluate",
{"expression": script, "returnByValue": True},
)
found = (result or {}).get("result", {}).get("result", {}).get("value", False)
# Same unwrap bug as wait_for_selector.
found = (result or {}).get("result", {}).get("value", False)
if found:
return {"ok": True, "text": text}
await asyncio.sleep(0.1)
+40 -5
View File
@@ -178,18 +178,37 @@ def register_interaction_tools(mcp: FastMCP) -> None:
delay_ms: int = 0,
clear_first: bool = True,
timeout_ms: int = 30000,
use_insert_text: bool = True,
) -> dict:
"""
Type text into an input element.
Automatically routes through a real CDP pointer click on the
element before inserting text so that rich-text editors like
Lexical (Gmail, LinkedIn DMs), Draft.js (X compose), and
ProseMirror (Reddit) see a native focus event and enable their
submit buttons. See the gcu-browser skill for the full "click-
then-type" pattern.
By default uses CDP Input.insertText which is the most reliable
way to insert text into rich editors. Set
``use_insert_text=False`` to fall back to per-character
keyDown/keyUp events (needed only for code editors that fire
on specific keystrokes, or when ``delay_ms`` typing animation
is required).
Args:
selector: CSS selector for the input element
text: Text to type
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
delay_ms: Delay between keystrokes in ms (default: 0)
delay_ms: Delay between keystrokes in ms (default: 0).
Forces the per-keystroke fallback when > 0.
clear_first: Clear existing text before typing (default: True)
timeout_ms: Timeout waiting for element (default: 30000)
use_insert_text: Use CDP Input.insertText (default: True) for
reliable insertion into rich-text editors.
Set False for per-keystroke dispatch.
Returns:
Dict with type result
@@ -223,6 +242,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
clear_first=clear_first,
delay_ms=delay_ms,
timeout_ms=timeout_ms,
use_insert_text=use_insert_text,
)
log_tool_call(
"browser_type",
@@ -277,21 +297,34 @@ def register_interaction_tools(mcp: FastMCP) -> None:
selector: str | None = None,
tab_id: int | None = None,
profile: str | None = None,
modifiers: list[str] | None = None,
) -> dict:
"""
Press a keyboard key.
Press a keyboard key, optionally with modifier keys held.
Args:
key: Key to press (e.g., 'Enter', 'Tab', 'Escape', 'ArrowDown')
key: Key to press (e.g., 'Enter', 'Tab', 'Escape', 'ArrowDown',
or a character like 'a')
selector: Focus element first (optional)
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
modifiers: Hold these modifier keys while pressing ``key``. Accepted
values (case-insensitive): "alt", "ctrl"/"control", "meta"/"cmd",
"shift". Examples: ``modifiers=["ctrl"], key="a"`` = Ctrl+A
(select all); ``modifiers=["shift"], key="Tab"`` = Shift+Tab;
``modifiers=["meta"], key="Enter"`` = Cmd+Enter.
Returns:
Dict with press result
"""
start = time.perf_counter()
params = {"key": key, "selector": selector, "tab_id": tab_id, "profile": profile}
params = {
"key": key,
"selector": selector,
"tab_id": tab_id,
"profile": profile,
"modifiers": modifiers,
}
bridge = get_bridge()
if not bridge or not bridge.is_connected:
@@ -312,7 +345,9 @@ def register_interaction_tools(mcp: FastMCP) -> None:
return result
try:
press_result = await bridge.press_key(target_tab, key, selector=selector)
press_result = await bridge.press_key(
target_tab, key, selector=selector, modifiers=modifiers
)
log_tool_call(
"browser_press",
params,