fix: tool credential filter
This commit is contained in:
@@ -19,11 +19,18 @@
|
||||
"Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -type f -exec grep -l \"FileConversationStore\\\\|class.*ConversationStore\" {} \\\\;)",
|
||||
"Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -exec grep -l \"run_parallel_workers\\\\|create_colony\" {} \\\\;)",
|
||||
"Bash(awk '/^ async def execute\\\\\\(self, ctx: AgentContext\\\\\\)/,/^ async def [a-z_]+/ {print NR\": \"$0}' /home/timothy/aden/hive/core/framework/agent_loop/agent_loop.py)",
|
||||
"Bash(grep -r \"max_concurrent_workers\\\\|max_depth\\\\|recursion\\\\|spawn.*bomb\" /home/timothy/aden/hive/core/framework/host/*.py)"
|
||||
"Bash(grep -r \"max_concurrent_workers\\\\|max_depth\\\\|recursion\\\\|spawn.*bomb\" /home/timothy/aden/hive/core/framework/host/*.py)",
|
||||
"Bash(wc -l /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
|
||||
"Bash(file /tmp/gcu_verify/*.png)",
|
||||
"Bash(ps -eo pid,cmd)",
|
||||
"Bash(ps -o pid,lstart,cmd -p 746640)",
|
||||
"Bash(kill 746636)",
|
||||
"Bash(ps -eo pid,lstart,cmd)"
|
||||
],
|
||||
"additionalDirectories": [
|
||||
"/home/timothy/.hive/skills/writing-hive-skills",
|
||||
"/tmp"
|
||||
"/tmp",
|
||||
"/home/timothy/.hive/skills"
|
||||
]
|
||||
},
|
||||
"hooks": {
|
||||
|
||||
@@ -85,6 +85,7 @@ from .template import TemplateResolver
|
||||
from .validation import (
|
||||
CredentialStatus,
|
||||
CredentialValidationResult,
|
||||
compute_unavailable_tools,
|
||||
ensure_credential_key_env,
|
||||
validate_agent_credentials,
|
||||
)
|
||||
@@ -150,6 +151,7 @@ __all__ = [
|
||||
# Validation
|
||||
"ensure_credential_key_env",
|
||||
"validate_agent_credentials",
|
||||
"compute_unavailable_tools",
|
||||
"CredentialStatus",
|
||||
"CredentialValidationResult",
|
||||
# Interactive setup
|
||||
|
||||
@@ -236,6 +236,46 @@ def _presync_aden_tokens(credential_specs: dict, *, force: bool = False) -> None
|
||||
)
|
||||
|
||||
|
||||
def compute_unavailable_tools(nodes: list) -> tuple[set[str], list[str]]:
    """Return ``(tool_names_to_drop, human_messages)`` for failed credentials.

    Runs ``validate_agent_credentials`` in non-raising mode
    (``verify=False, raise_on_error=False``) and walks ``result.failed``.
    Every tool bound to a failed credential status is added to the drop
    set, and one summary line per failed credential is produced so the
    caller can log why tools disappeared.

    Use this at every worker-spawn preflight so missing credentials
    filter tools out of the graph instead of hard-failing the whole
    spawn. Only affects non-MCP tools — the MCP admission gate
    (``_build_mcp_admission_gate``) already handles MCP tools at
    registration time.

    Args:
        nodes: Graph nodes, forwarded unchanged to
            ``validate_agent_credentials``.

    Returns:
        ``(drop, messages)``. ``drop`` is the set of tool names to remove;
        ``messages`` holds one line per failed credential naming its env
        var, the failure reason (``missing`` / ``aden_not_connected`` /
        ``invalid``) and up to six of the affected tools. Both are empty
        when validation itself raises.
    """
    try:
        result = validate_agent_credentials(nodes, verify=False, raise_on_error=False)
    except Exception as exc:
        # Best-effort by design: a validator bug must not block the spawn.
        # Logged at WARNING with the traceback because swallowing it means
        # credential-based filtering is skipped entirely for this call.
        logger.warning(
            "compute_unavailable_tools: validation raised: %s",
            exc,
            exc_info=True,
        )
        return set(), []

    preview_limit = 6  # cap the tool names listed per message
    drop: set[str] = set()
    messages: list[str] = []
    for status in result.failed:
        # Materialise once so slicing works for any iterable of names.
        tools = list(status.tools or ())
        if not tools:
            continue
        drop.update(tools)

        if status.aden_not_connected:
            reason = "aden_not_connected"
        elif status.available and status.valid is False:
            reason = "invalid"
        else:
            reason = "missing"

        shown = ", ".join(tools[:preview_limit])
        hidden = len(tools) - preview_limit
        suffix = f" +{hidden} more" if hidden > 0 else ""
        messages.append(
            f"{status.env_var} ({reason}) → drops {len(tools)} tool(s): {shown}{suffix}"
        )
    return drop, messages
|
||||
|
||||
|
||||
def validate_agent_credentials(
|
||||
nodes: list,
|
||||
quiet: bool = False,
|
||||
|
||||
@@ -44,7 +44,7 @@ from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.credentials.models import CredentialError
|
||||
from framework.host.event_bus import AgentEvent, EventType
|
||||
from framework.loader.preload_validation import credential_errors_to_json, validate_credentials
|
||||
from framework.loader.preload_validation import credential_errors_to_json
|
||||
from framework.server.app import validate_agent_path
|
||||
from framework.tools.flowchart_utils import (
|
||||
FLOWCHART_TYPES,
|
||||
@@ -3875,24 +3875,50 @@ def register_queen_lifecycle_tools(
|
||||
)
|
||||
|
||||
try:
|
||||
# Pre-flight: validate credentials and resync MCP servers.
|
||||
# Still uses the legacy AgentHost handles because that's
|
||||
# where credentials live; the actual run is via colony.
|
||||
# Pre-flight: compute the set of tools whose credentials are
|
||||
# NOT currently available, and resync MCP servers. We do NOT
|
||||
# hard-fail on missing credentials anymore — instead we drop
|
||||
# the affected tools from the worker's spawn_tools list a
|
||||
# few lines below. Hard-failing here caused unrelated tools
|
||||
# (e.g. GitHub tools leaking into a LinkedIn worker config)
|
||||
# to block the whole spawn with a CredentialError; the fix
|
||||
# is to treat unset credentials as "drop these tools" rather
|
||||
# than "abort the worker".
|
||||
#
|
||||
# Note: the MCP admission gate (_build_mcp_admission_gate in
|
||||
# tool_registry.py) already filters MCP tools at registration
|
||||
# time. This preflight covers the non-MCP path — tools.py
|
||||
# discoveries via discover_from_module — which has no
|
||||
# credential gate of its own.
|
||||
loop = asyncio.get_running_loop()
|
||||
unavailable_tools: set[str] = set()
|
||||
|
||||
async def _preflight():
|
||||
cred_error: CredentialError | None = None
|
||||
nonlocal unavailable_tools
|
||||
try:
|
||||
await loop.run_in_executor(
|
||||
from framework.credentials.validation import compute_unavailable_tools
|
||||
|
||||
drop, messages = await loop.run_in_executor(
|
||||
None,
|
||||
lambda: validate_credentials(
|
||||
legacy.graph.nodes,
|
||||
interactive=False,
|
||||
skip=False,
|
||||
),
|
||||
lambda: compute_unavailable_tools(legacy.graph.nodes),
|
||||
)
|
||||
unavailable_tools = drop
|
||||
if drop:
|
||||
logger.warning(
|
||||
"run_agent_with_input: dropping %d tool(s) with "
|
||||
"unavailable credentials from worker spawn: %s",
|
||||
len(drop),
|
||||
"; ".join(messages),
|
||||
)
|
||||
except Exception as exc:
|
||||
# Validation itself failing (not a credential failure —
|
||||
# a code error in the validator) should not block the
|
||||
# spawn. Log and proceed as if nothing was dropped.
|
||||
logger.warning(
|
||||
"compute_unavailable_tools raised, proceeding without "
|
||||
"credential-based tool filtering: %s",
|
||||
exc,
|
||||
)
|
||||
except CredentialError as e:
|
||||
cred_error = e
|
||||
|
||||
runner = getattr(session, "runner", None)
|
||||
if runner:
|
||||
@@ -3904,9 +3930,6 @@ def register_queen_lifecycle_tools(
|
||||
except Exception as e:
|
||||
logger.warning("MCP resync failed: %s", e)
|
||||
|
||||
if cred_error is not None:
|
||||
raise cred_error
|
||||
|
||||
try:
|
||||
await asyncio.wait_for(_preflight(), timeout=_START_PREFLIGHT_TIMEOUT)
|
||||
except TimeoutError:
|
||||
@@ -3914,8 +3937,6 @@ def register_queen_lifecycle_tools(
|
||||
"run_agent_with_input preflight timed out after %ds — proceeding",
|
||||
_START_PREFLIGHT_TIMEOUT,
|
||||
)
|
||||
except CredentialError:
|
||||
raise # handled below
|
||||
|
||||
# Build a per-spawn AgentSpec that mirrors the loaded
|
||||
# worker's entry-node identity. This is what makes the
|
||||
@@ -3944,6 +3965,24 @@ def register_queen_lifecycle_tools(
|
||||
else []
|
||||
)
|
||||
|
||||
# Drop any tool whose credential isn't available (GitHub
|
||||
# tools when GITHUB_TOKEN is unset, etc). The preflight
|
||||
# above populated ``unavailable_tools``; apply the filter
|
||||
# HERE — before the AgentSpec is built — so the worker
|
||||
# only sees tools it can actually run.
|
||||
dropped_from_names: list[str] = []
|
||||
if unavailable_tools:
|
||||
original = worker_tool_names
|
||||
worker_tool_names = [t for t in original if t not in unavailable_tools]
|
||||
dropped_from_names = [t for t in original if t in unavailable_tools]
|
||||
if dropped_from_names:
|
||||
logger.warning(
|
||||
"run_agent_with_input: dropped %d tool(s) from worker "
|
||||
"AgentSpec due to unavailable credentials: %s",
|
||||
len(dropped_from_names),
|
||||
", ".join(dropped_from_names),
|
||||
)
|
||||
|
||||
spawn_spec = AgentSpec(
|
||||
id=f"loaded_worker:{getattr(graph, 'id', 'unknown')}",
|
||||
name=getattr(graph, "id", "loaded_worker"),
|
||||
@@ -3962,6 +4001,26 @@ def register_queen_lifecycle_tools(
|
||||
spawn_tools = list(getattr(legacy, "_tools", []) or [])
|
||||
spawn_tool_executor = getattr(legacy, "_tool_executor", None)
|
||||
|
||||
# Same credential-based filter on the live Tool objects
|
||||
# passed to the worker. Without this the worker would still
|
||||
# receive the GitHub tool definitions in its registry —
|
||||
# it just wouldn't see them in its AgentSpec, so the LLM
|
||||
# wouldn't know to use them. Dropping from both lists
|
||||
# makes the filter complete.
|
||||
if unavailable_tools:
|
||||
before = len(spawn_tools)
|
||||
spawn_tools = [
|
||||
t for t in spawn_tools
|
||||
if getattr(t, "name", None) not in unavailable_tools
|
||||
]
|
||||
dropped_count = before - len(spawn_tools)
|
||||
if dropped_count:
|
||||
logger.info(
|
||||
"run_agent_with_input: dropped %d tool object(s) from "
|
||||
"spawn_tools (unavailable credentials)",
|
||||
dropped_count,
|
||||
)
|
||||
|
||||
worker_ids = await colony.spawn(
|
||||
task=task,
|
||||
count=1,
|
||||
|
||||
+393
-68
@@ -123,6 +123,21 @@ class BeelineBridge:
|
||||
logger.warning("Bridge status server could not start on port %d: %s", status_port, e)
|
||||
|
||||
async def stop(self) -> None:
|
||||
# Cancel in-flight bridge requests so any caller stuck in _send
|
||||
# sees CancelledError immediately instead of waiting the full
|
||||
# 30s timeout. Mirrors the cleanup in _handle_connection's
|
||||
# disconnect branch so both exit paths behave the same.
|
||||
for fut in self._pending.values():
|
||||
if not fut.done():
|
||||
fut.cancel()
|
||||
self._pending.clear()
|
||||
# Drop CDP attach cache — next run must re-attach fresh.
|
||||
self._cdp_attached.clear()
|
||||
# Drop highlight state — stale entries would otherwise carry
|
||||
# over into a subsequent run and confuse screenshot annotation.
|
||||
_interaction_highlights.clear()
|
||||
self._ws = None
|
||||
|
||||
if self._server:
|
||||
self._server.close()
|
||||
try:
|
||||
@@ -222,7 +237,14 @@ class BeelineBridge:
|
||||
fut.cancel()
|
||||
self._pending.clear()
|
||||
|
||||
async def _send(self, type_: str, **params) -> dict:
|
||||
# Default wait on a bridge command. Callers with known-slow ops
|
||||
# (full-page screenshots on slow networks, AX tree on huge pages)
|
||||
# can pass a longer value via _send(..., timeout=...). Using the
|
||||
# same default as the old hard-coded value so existing call sites
|
||||
# don't regress.
|
||||
_DEFAULT_SEND_TIMEOUT_S: float = 30.0
|
||||
|
||||
async def _send(self, type_: str, *, timeout: float | None = None, **params) -> dict:
|
||||
"""Send a command to the extension and wait for the result."""
|
||||
if not self._ws:
|
||||
raise RuntimeError("Extension not connected")
|
||||
@@ -231,27 +253,58 @@ class BeelineBridge:
|
||||
fut: asyncio.Future = asyncio.get_event_loop().create_future()
|
||||
self._pending[msg_id] = fut
|
||||
start = time.perf_counter()
|
||||
effective_timeout = timeout if timeout is not None else self._DEFAULT_SEND_TIMEOUT_S
|
||||
|
||||
log_bridge_message("send", type_, msg_id=msg_id, params=params)
|
||||
|
||||
try:
|
||||
await self._ws.send(json.dumps({"id": msg_id, "type": type_, **params}))
|
||||
result = await asyncio.wait_for(fut, timeout=30.0)
|
||||
result = await asyncio.wait_for(fut, timeout=effective_timeout)
|
||||
duration_ms = (time.perf_counter() - start) * 1000
|
||||
log_bridge_message("send", type_, msg_id=msg_id, result=result, duration_ms=duration_ms)
|
||||
return result
|
||||
except TimeoutError:
|
||||
self._pending.pop(msg_id, None)
|
||||
log_bridge_message("send", type_, msg_id=msg_id, error="timeout")
|
||||
raise RuntimeError(f"Bridge command '{type_}' timed out") from None
|
||||
# Include which CDP method (if any) so the caller can see
|
||||
# what actually hung — the generic 'cdp' type is useless
|
||||
# when ten different CDP calls use the same type.
|
||||
detail = f" method={params.get('method')}" if params.get("method") else ""
|
||||
raise RuntimeError(
|
||||
f"Bridge command '{type_}'{detail} timed out after {effective_timeout:.0f}s"
|
||||
) from None
|
||||
except BaseException:
|
||||
# CancelledError or any other exception — remove stale future so a late
|
||||
# response from the extension doesn't try to resolve a cancelled future.
|
||||
self._pending.pop(msg_id, None)
|
||||
raise
|
||||
|
||||
# Substrings that indicate Chrome detached the debugger out from
|
||||
# under us (tab closed, user opened DevTools, cross-origin nav).
|
||||
# Our in-memory _cdp_attached set is now stale; next call should
|
||||
# re-attach rather than reporting a cryptic "Target not found".
|
||||
_CDP_DEAD_SESSION_MARKERS = (
|
||||
"target closed",
|
||||
"target not found",
|
||||
"not attached",
|
||||
"session closed",
|
||||
"inspector already attached",
|
||||
"no target with given id",
|
||||
)
|
||||
|
||||
def _is_cdp_dead_session(self, exc: BaseException) -> bool:
    """Return True when ``exc``'s message contains a dead-session marker.

    The check is a case-insensitive substring match of ``str(exc)``
    against every entry of ``self._CDP_DEAD_SESSION_MARKERS``.

    Args:
        exc: The exception raised by a bridge / CDP call.

    Returns:
        ``True`` if any marker occurs in the lowercased message,
        ``False`` otherwise (including for an empty message).
    """
    message = str(exc).lower()
    return any(marker in message for marker in self._CDP_DEAD_SESSION_MARKERS)
|
||||
|
||||
async def _cdp(self, tab_id: int, method: str, params: dict | None = None) -> dict:
|
||||
"""Send a CDP command to a tab."""
|
||||
"""Send a CDP command to a tab.
|
||||
|
||||
On a dead-session error (Chrome detached externally — tab closed,
|
||||
DevTools opened, cross-origin nav), evict the stale attach
|
||||
cache entry, reattach, and retry once. Without this the Python
|
||||
side would keep assuming it's attached and every subsequent call
|
||||
would hit the same error until someone restarted the bridge.
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
try:
|
||||
result = await self._send("cdp", tabId=tab_id, method=method, params=params or {})
|
||||
@@ -261,6 +314,33 @@ class BeelineBridge:
|
||||
except Exception as e:
|
||||
duration_ms = (time.perf_counter() - start) * 1000
|
||||
log_cdp_command(tab_id, method, params, error=str(e), duration_ms=duration_ms)
|
||||
if self._is_cdp_dead_session(e):
|
||||
logger.info(
|
||||
"CDP session for tab %d looks dead (%s) — re-attaching and retrying",
|
||||
tab_id,
|
||||
str(e)[:120],
|
||||
)
|
||||
self._cdp_attached.discard(tab_id)
|
||||
try:
|
||||
reattach = await self._send("cdp.attach", tabId=tab_id)
|
||||
if reattach.get("ok"):
|
||||
self._cdp_attached.add(tab_id)
|
||||
retry_start = time.perf_counter()
|
||||
result = await self._send(
|
||||
"cdp", tabId=tab_id, method=method, params=params or {}
|
||||
)
|
||||
log_cdp_command(
|
||||
tab_id,
|
||||
method,
|
||||
params,
|
||||
result,
|
||||
duration_ms=(time.perf_counter() - retry_start) * 1000,
|
||||
)
|
||||
return result
|
||||
except Exception as retry_exc:
|
||||
logger.debug(
|
||||
"CDP reattach+retry for tab %d failed: %s", tab_id, retry_exc
|
||||
)
|
||||
raise
|
||||
|
||||
async def _try_enable_domain(self, tab_id: int, domain: str) -> None:
|
||||
@@ -311,7 +391,14 @@ class BeelineBridge:
|
||||
|
||||
async def close_tab(self, tab_id: int) -> dict:
|
||||
"""Close a tab by ID."""
|
||||
return await self._send("tab.close", tabId=tab_id)
|
||||
result = await self._send("tab.close", tabId=tab_id)
|
||||
# Drop per-tab state — the id may be reused by Chrome much
|
||||
# later, and carrying a stale highlight or "attached" flag
|
||||
# forward would misannotate screenshots or skip a needed
|
||||
# reattach on the reused id.
|
||||
self._cdp_attached.discard(tab_id)
|
||||
_interaction_highlights.pop(tab_id, None)
|
||||
return result
|
||||
|
||||
async def list_tabs(self, group_id: int | None = None) -> dict:
|
||||
"""List tabs, optionally filtered by group.
|
||||
@@ -361,6 +448,11 @@ class BeelineBridge:
|
||||
if wait_until not in VALID_WAIT_UNTIL:
|
||||
wait_until = "load"
|
||||
|
||||
# Drop the stale interaction highlight before loading a new
|
||||
# page — otherwise the next screenshot will annotate the new
|
||||
# page with a rect from the previous page's coordinate system.
|
||||
_interaction_highlights.pop(tab_id, None)
|
||||
|
||||
# Attach debugger if needed
|
||||
await self.cdp_attach(tab_id)
|
||||
|
||||
@@ -382,9 +474,11 @@ class BeelineBridge:
|
||||
"Runtime.evaluate",
|
||||
{"expression": "document.readyState", "returnByValue": True},
|
||||
)
|
||||
ready_state = (
|
||||
(eval_result or {}).get("result", {}).get("result", {}).get("value", "")
|
||||
)
|
||||
# _cdp returns the CDP response body; Runtime.evaluate shape
|
||||
# is {"result": {"type": ..., "value": ...}} — one "result"
|
||||
# hop, not two. The extra hop was always returning "" and
|
||||
# this entire lifecycle loop was running until the deadline.
|
||||
ready_state = (eval_result or {}).get("result", {}).get("value", "")
|
||||
|
||||
if wait_until == "domcontentloaded" and ready_state in ("interactive", "complete"):
|
||||
break
|
||||
@@ -416,17 +510,31 @@ class BeelineBridge:
|
||||
return {
|
||||
"ok": True,
|
||||
"tabId": tab_id,
|
||||
"url": (url_result or {}).get("result", {}).get("result", {}).get("value", ""),
|
||||
"title": (title_result or {}).get("result", {}).get("result", {}).get("value", ""),
|
||||
"url": (url_result or {}).get("result", {}).get("value", ""),
|
||||
"title": (title_result or {}).get("result", {}).get("value", ""),
|
||||
}
|
||||
|
||||
async def go_back(self, tab_id: int) -> dict:
|
||||
"""Navigate back in history."""
|
||||
"""Navigate back in history.
|
||||
|
||||
Uses ``history.back()`` via Runtime.evaluate — modern Chrome CDP
|
||||
no longer exposes ``Page.goBack`` / ``Page.goForward`` (removed
|
||||
in favour of ``Page.navigateToHistoryEntry``, which requires
|
||||
first fetching the history list). ``history.back()`` is simpler,
|
||||
works across every Chrome version, and matches what the user
|
||||
expects when they call ``browser_go_back``.
|
||||
"""
|
||||
_interaction_highlights.pop(tab_id, None)
|
||||
await self.cdp_attach(tab_id)
|
||||
await self._cdp(tab_id, "Page.enable")
|
||||
await self._cdp(tab_id, "Page.goBack")
|
||||
|
||||
# Get current URL
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Runtime.evaluate",
|
||||
{"expression": "history.back()", "returnByValue": True},
|
||||
)
|
||||
# Give the browser a beat to commit the navigation before we
|
||||
# read the new URL.
|
||||
await asyncio.sleep(0.3)
|
||||
result = await self._cdp(
|
||||
tab_id,
|
||||
"Runtime.evaluate",
|
||||
@@ -435,15 +543,20 @@ class BeelineBridge:
|
||||
return {
|
||||
"ok": True,
|
||||
"action": "back",
|
||||
"url": (result or {}).get("result", {}).get("result", {}).get("value", ""),
|
||||
"url": (result or {}).get("result", {}).get("value", ""),
|
||||
}
|
||||
|
||||
async def go_forward(self, tab_id: int) -> dict:
|
||||
"""Navigate forward in history."""
|
||||
"""Navigate forward in history. See go_back() for why we use JS."""
|
||||
_interaction_highlights.pop(tab_id, None)
|
||||
await self.cdp_attach(tab_id)
|
||||
await self._cdp(tab_id, "Page.enable")
|
||||
await self._cdp(tab_id, "Page.goForward")
|
||||
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Runtime.evaluate",
|
||||
{"expression": "history.forward()", "returnByValue": True},
|
||||
)
|
||||
await asyncio.sleep(0.3)
|
||||
result = await self._cdp(
|
||||
tab_id,
|
||||
"Runtime.evaluate",
|
||||
@@ -452,11 +565,12 @@ class BeelineBridge:
|
||||
return {
|
||||
"ok": True,
|
||||
"action": "forward",
|
||||
"url": (result or {}).get("result", {}).get("result", {}).get("value", ""),
|
||||
"url": (result or {}).get("result", {}).get("value", ""),
|
||||
}
|
||||
|
||||
async def reload(self, tab_id: int) -> dict:
|
||||
"""Reload the page."""
|
||||
_interaction_highlights.pop(tab_id, None)
|
||||
await self.cdp_attach(tab_id)
|
||||
await self._cdp(tab_id, "Page.enable")
|
||||
await self._cdp(tab_id, "Page.reload")
|
||||
@@ -469,7 +583,7 @@ class BeelineBridge:
|
||||
return {
|
||||
"ok": True,
|
||||
"action": "reload",
|
||||
"url": (result or {}).get("result", {}).get("result", {}).get("value", ""),
|
||||
"url": (result or {}).get("result", {}).get("value", ""),
|
||||
}
|
||||
|
||||
# ── Interaction ────────────────────────────────────────────────────────────
|
||||
@@ -759,75 +873,150 @@ class BeelineBridge:
|
||||
clear_first: bool = True,
|
||||
delay_ms: int = 0,
|
||||
timeout_ms: int = 30000,
|
||||
use_insert_text: bool = True,
|
||||
) -> dict:
|
||||
"""Type text into an element.
|
||||
|
||||
Uses JavaScript focus for reliability, then CDP key events.
|
||||
Routes through a real CDP pointer click on the target rect BEFORE
|
||||
inserting text. This is critical for rich-text editors (Draft.js,
|
||||
Lexical, ProseMirror, React-controlled contenteditable): those
|
||||
frameworks only register input as "real" after seeing a native
|
||||
focus event sourced from a real pointer interaction — a
|
||||
JS-sourced ``el.focus()`` is ignored, and the submit button
|
||||
stays disabled because the framework's internal state never
|
||||
updates. Sending a CDP click first fires the real
|
||||
pointerdown/pointerup/click/focus sequence that every modern
|
||||
framework listens to.
|
||||
|
||||
After clicking, we insert text via ``Input.insertText`` by
|
||||
default (``use_insert_text=True``). insertText is a dedicated
|
||||
CDP method that asks the browser to commit text into the
|
||||
focused element as if IME just committed it — it works
|
||||
cleanly on rich editors where per-character keyDown events
|
||||
would otherwise be eaten or mis-timed (empirically verified
|
||||
against LinkedIn's Lexical message composer 2026-04-11).
|
||||
Playwright uses the same approach under the hood.
|
||||
|
||||
Set ``use_insert_text=False`` to get the old per-character
|
||||
keyDown/keyUp path when an editor needs precise keystroke
|
||||
timing (autocomplete triggers, code editors that fire on
|
||||
specific chars, ``delay_ms`` typing animations).
|
||||
"""
|
||||
await self.cdp_attach(tab_id)
|
||||
await self._try_enable_domain(tab_id, "DOM")
|
||||
await self._try_enable_domain(tab_id, "Input")
|
||||
await self._try_enable_domain(tab_id, "Runtime")
|
||||
|
||||
# First, scroll into view and focus via JavaScript (more reliable than CDP)
|
||||
# Find + scroll + (optionally) clear via JS. We still need the
|
||||
# rect, and clearing via `.value = ''` / `.textContent = ''`
|
||||
# is the most reliable way to reset pre-existing content.
|
||||
focus_script = f"""
|
||||
(function() {{
|
||||
const el = document.querySelector({json.dumps(selector)});
|
||||
if (!el) return false;
|
||||
if (!el) return null;
|
||||
|
||||
// Scroll into view
|
||||
// Scroll into view so the click lands in-viewport.
|
||||
el.scrollIntoView({{ block: 'center' }});
|
||||
|
||||
// Focus the element
|
||||
el.focus();
|
||||
|
||||
// Clear if requested
|
||||
// Clear if requested.
|
||||
if ({str(clear_first).lower()}) {{
|
||||
if (el.value !== undefined) {{
|
||||
el.value = '';
|
||||
// Nudge React's onChange — the framework reads
|
||||
// .value via a setter hook, and without firing
|
||||
// an input event the component state remains
|
||||
// stale after our value assignment.
|
||||
el.dispatchEvent(new Event('input', {{bubbles: true}}));
|
||||
}} else if (el.isContentEditable) {{
|
||||
el.textContent = '';
|
||||
el.dispatchEvent(new Event('input', {{bubbles: true}}));
|
||||
}}
|
||||
}}
|
||||
|
||||
return true;
|
||||
const r = el.getBoundingClientRect();
|
||||
return {{
|
||||
x: r.left + r.width / 2,
|
||||
y: r.top + r.height / 2,
|
||||
w: r.width,
|
||||
h: r.height,
|
||||
}};
|
||||
}})();
|
||||
"""
|
||||
|
||||
focus_result = await self.evaluate(tab_id, focus_script)
|
||||
success = (focus_result or {}).get("result", False)
|
||||
rect = (focus_result or {}).get("result")
|
||||
|
||||
if not success:
|
||||
# Element not found - wait and retry
|
||||
if not rect:
|
||||
# Element not found — wait + retry until timeout.
|
||||
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
|
||||
while asyncio.get_event_loop().time() < deadline:
|
||||
result = await self.evaluate(tab_id, focus_script)
|
||||
if result and (result or {}).get("result", False):
|
||||
success = True
|
||||
rect = (result or {}).get("result") if result else None
|
||||
if rect:
|
||||
break
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
if not success:
|
||||
if not rect:
|
||||
return {"ok": False, "error": f"Element not found: {selector}"}
|
||||
|
||||
await asyncio.sleep(0.05) # Wait for focus to take effect
|
||||
if not rect.get("w") or not rect.get("h"):
|
||||
return {
|
||||
"ok": False,
|
||||
"error": f"Element has zero dimensions, can't click to focus: {selector}",
|
||||
}
|
||||
|
||||
# Type each character using CDP key events
|
||||
for char in text:
|
||||
# Dispatch key down
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchKeyEvent",
|
||||
{"type": "keyDown", "text": char},
|
||||
)
|
||||
# Dispatch key up
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchKeyEvent",
|
||||
{"type": "keyUp", "text": char},
|
||||
)
|
||||
if delay_ms > 0:
|
||||
await asyncio.sleep(delay_ms / 1000)
|
||||
# Fire a real CDP pointer click at the element's center. This is
|
||||
# what unblocks rich-text editors — JS el.focus() is not enough.
|
||||
click_x = rect["x"]
|
||||
click_y = rect["y"]
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchMouseEvent",
|
||||
{"type": "mousePressed", "x": click_x, "y": click_y, "button": "left", "clickCount": 1},
|
||||
)
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchMouseEvent",
|
||||
{"type": "mouseReleased", "x": click_x, "y": click_y, "button": "left", "clickCount": 1},
|
||||
)
|
||||
await asyncio.sleep(0.15) # Let focus / editor-init animations settle.
|
||||
|
||||
if use_insert_text and delay_ms <= 0:
|
||||
# CDP Input.insertText is the most reliable way to insert
|
||||
# text into a rich-text editor. It bypasses the keyboard
|
||||
# event pipeline entirely and commits text into the focused
|
||||
# element as if IME just committed it. Works on plain
|
||||
# <input>/<textarea>, contenteditable, Lexical, Draft.js,
|
||||
# ProseMirror, Monaco textarea buffers — verified empirically
|
||||
# against LinkedIn's message composer (Lexical) on 2026-04-11
|
||||
# where the per-char keyDown path left the editor empty.
|
||||
await self._cdp(tab_id, "Input.insertText", {"text": text})
|
||||
else:
|
||||
# Fallback path: per-character keyDown/keyUp with full key,
|
||||
# code, and text fields. Used when the caller explicitly
|
||||
# wants per-keystroke dispatch (autocomplete testing, code
|
||||
# editors that fire on specific chars, animated typing
|
||||
# with ``delay_ms``). Populating ``code`` for ASCII is
|
||||
# needed so frameworks that branch on ``event.code`` see
|
||||
# the right values.
|
||||
for char in text:
|
||||
key_params: dict[str, Any] = {
|
||||
"type": "keyDown",
|
||||
"text": char,
|
||||
"key": char,
|
||||
}
|
||||
if len(char) == 1 and char.isalpha():
|
||||
key_params["code"] = f"Key{char.upper()}"
|
||||
elif len(char) == 1 and char.isdigit():
|
||||
key_params["code"] = f"Digit{char}"
|
||||
await self._cdp(tab_id, "Input.dispatchKeyEvent", key_params)
|
||||
|
||||
key_up = {"type": "keyUp", "key": char}
|
||||
if "code" in key_params:
|
||||
key_up["code"] = key_params["code"]
|
||||
await self._cdp(tab_id, "Input.dispatchKeyEvent", key_up)
|
||||
if delay_ms > 0:
|
||||
await asyncio.sleep(delay_ms / 1000)
|
||||
|
||||
# Highlight the element that was typed into
|
||||
rect_result = await self.evaluate(
|
||||
@@ -844,12 +1033,47 @@ class BeelineBridge:
|
||||
)
|
||||
return {"ok": True, "action": "type", "selector": selector, "length": len(text)}
|
||||
|
||||
async def press_key(self, tab_id: int, key: str, selector: str | None = None) -> dict:
|
||||
"""Press a keyboard key.
|
||||
# CDP Input.dispatchKeyEvent modifiers bitmask.
|
||||
_CDP_MODIFIERS = {"alt": 1, "ctrl": 2, "control": 2, "meta": 4, "cmd": 4, "shift": 8}
|
||||
|
||||
# How Chrome expects each modifier key as its OWN keyDown event —
|
||||
# name, code, and Windows virtual key code. Dispatched before the
|
||||
# main key so Chrome sees the modifier as "held" during the main
|
||||
# event, which is what actually triggers browser shortcuts like
|
||||
# Ctrl+A, Cmd+L, Shift+Tab.
|
||||
_MODIFIER_KEYS = {
|
||||
"alt": {"key": "Alt", "code": "AltLeft", "windowsVirtualKeyCode": 18},
|
||||
"ctrl": {"key": "Control", "code": "ControlLeft", "windowsVirtualKeyCode": 17},
|
||||
"control": {"key": "Control", "code": "ControlLeft", "windowsVirtualKeyCode": 17},
|
||||
"meta": {"key": "Meta", "code": "MetaLeft", "windowsVirtualKeyCode": 91},
|
||||
"cmd": {"key": "Meta", "code": "MetaLeft", "windowsVirtualKeyCode": 91},
|
||||
"shift": {"key": "Shift", "code": "ShiftLeft", "windowsVirtualKeyCode": 16},
|
||||
}
|
||||
|
||||
def _cdp_modifier_mask(self, modifiers: list[str] | None) -> int:
|
||||
if not modifiers:
|
||||
return 0
|
||||
mask = 0
|
||||
for m in modifiers:
|
||||
mask |= self._CDP_MODIFIERS.get(m.lower(), 0)
|
||||
return mask
|
||||
|
||||
async def press_key(
|
||||
self,
|
||||
tab_id: int,
|
||||
key: str,
|
||||
selector: str | None = None,
|
||||
modifiers: list[str] | None = None,
|
||||
) -> dict:
|
||||
"""Press a keyboard key, optionally with modifier keys held.
|
||||
|
||||
Args:
|
||||
key: Key name like 'Enter', 'Tab', 'Escape', 'ArrowDown', etc.
|
||||
selector: Optional selector to focus first
|
||||
modifiers: Optional list of modifier keys to hold while pressing
|
||||
``key``. Accepted values: "alt", "ctrl"/"control", "meta"/"cmd",
|
||||
"shift". Example: ``modifiers=["ctrl"]`` → Ctrl+key, which
|
||||
enables shortcuts like Ctrl+A, Ctrl+L, Cmd+Enter, Shift+Tab.
|
||||
"""
|
||||
await self.cdp_attach(tab_id)
|
||||
await self._try_enable_domain(tab_id, "Input")
|
||||
@@ -882,19 +1106,110 @@ class BeelineBridge:
|
||||
}
|
||||
|
||||
text, key_name = key_map.get(key, (key, key))
|
||||
mod_mask = self._cdp_modifier_mask(modifiers)
|
||||
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchKeyEvent",
|
||||
{"type": "keyDown", "key": key_name, "text": text if text else None},
|
||||
)
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchKeyEvent",
|
||||
{"type": "keyUp", "key": key_name, "text": text if text else None},
|
||||
)
|
||||
# With modifiers held, suppress the printable text so that
|
||||
# e.g. Ctrl+A doesn't also type the character "a" into the
|
||||
# focused field (CDP will still fire the shortcut).
|
||||
effective_text = text if (text and mod_mask == 0) else None
|
||||
|
||||
return {"ok": True, "action": "press", "key": key}
|
||||
# Compute ``code`` and ``windowsVirtualKeyCode`` for the main
|
||||
# key. These are MANDATORY for Chrome's shortcut dispatcher —
|
||||
# without them, Ctrl+A etc. reach the DOM with ``code=""`` and
|
||||
# ``which=0`` and Chrome doesn't recognise them as shortcuts.
|
||||
# Verified empirically on chrome 131 against a real input.
|
||||
main_code: str | None = None
|
||||
main_vk: int | None = None
|
||||
special_vk = {
|
||||
"Enter": (13, "Enter"),
|
||||
"Tab": (9, "Tab"),
|
||||
"Escape": (27, "Escape"),
|
||||
"Backspace": (8, "Backspace"),
|
||||
"Delete": (46, "Delete"),
|
||||
"ArrowUp": (38, "ArrowUp"),
|
||||
"ArrowDown": (40, "ArrowDown"),
|
||||
"ArrowLeft": (37, "ArrowLeft"),
|
||||
"ArrowRight": (39, "ArrowRight"),
|
||||
"Home": (36, "Home"),
|
||||
"End": (35, "End"),
|
||||
"PageUp": (33, "PageUp"),
|
||||
"PageDown": (34, "PageDown"),
|
||||
}
|
||||
if key_name in special_vk:
|
||||
main_vk, main_code = special_vk[key_name]
|
||||
elif len(key_name) == 1 and key_name.isalpha():
|
||||
main_code = f"Key{key_name.upper()}"
|
||||
main_vk = ord(key_name.upper()) # 'A' = 65 ... 'Z' = 90
|
||||
elif len(key_name) == 1 and key_name.isdigit():
|
||||
main_code = f"Digit{key_name}"
|
||||
main_vk = ord(key_name) # '0' = 48 ... '9' = 57
|
||||
|
||||
# Press each modifier as a separate keyDown BEFORE the main
|
||||
# key. Sending ``modifiers: mask`` on the main key alone isn't
|
||||
# enough — Chrome's shortcut dispatcher looks for a held
|
||||
# modifier event, not just a flag. Matches the Playwright /
|
||||
# Puppeteer sequence. Release modifiers in reverse order after
|
||||
# the main key so the "held" state is correct throughout.
|
||||
pressed_mods: list[dict] = []
|
||||
if modifiers:
|
||||
for m in modifiers:
|
||||
spec = self._MODIFIER_KEYS.get(m.lower())
|
||||
if spec is None:
|
||||
continue
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchKeyEvent",
|
||||
{
|
||||
"type": "keyDown",
|
||||
"key": spec["key"],
|
||||
"code": spec["code"],
|
||||
"windowsVirtualKeyCode": spec["windowsVirtualKeyCode"],
|
||||
"modifiers": mod_mask,
|
||||
},
|
||||
)
|
||||
pressed_mods.append(spec)
|
||||
|
||||
main_down: dict[str, Any] = {
|
||||
# Use rawKeyDown when a modifier is held so Chrome skips
|
||||
# text insertion and routes the event to the shortcut
|
||||
# dispatcher. For plain press_key without modifiers we can
|
||||
# use regular keyDown.
|
||||
"type": "rawKeyDown" if mod_mask else "keyDown",
|
||||
"key": key_name,
|
||||
"text": effective_text,
|
||||
"modifiers": mod_mask,
|
||||
}
|
||||
main_up: dict[str, Any] = {
|
||||
"type": "keyUp",
|
||||
"key": key_name,
|
||||
"text": effective_text,
|
||||
"modifiers": mod_mask,
|
||||
}
|
||||
if main_code is not None:
|
||||
main_down["code"] = main_code
|
||||
main_up["code"] = main_code
|
||||
if main_vk is not None:
|
||||
main_down["windowsVirtualKeyCode"] = main_vk
|
||||
main_up["windowsVirtualKeyCode"] = main_vk
|
||||
|
||||
await self._cdp(tab_id, "Input.dispatchKeyEvent", main_down)
|
||||
await self._cdp(tab_id, "Input.dispatchKeyEvent", main_up)
|
||||
|
||||
# Release modifiers in reverse order.
|
||||
for spec in reversed(pressed_mods):
|
||||
await self._cdp(
|
||||
tab_id,
|
||||
"Input.dispatchKeyEvent",
|
||||
{
|
||||
"type": "keyUp",
|
||||
"key": spec["key"],
|
||||
"code": spec["code"],
|
||||
"windowsVirtualKeyCode": spec["windowsVirtualKeyCode"],
|
||||
"modifiers": 0,
|
||||
},
|
||||
)
|
||||
|
||||
return {"ok": True, "action": "press", "key": key, "modifiers": modifiers or []}
|
||||
|
||||
# Shared JS snippet: shadow-piercing querySelector via ">>>" separator
|
||||
_SHADOW_QUERY_JS = """
|
||||
@@ -916,9 +1231,15 @@ class BeelineBridge:
|
||||
Example: '#interop-outlet >>> #ember37 >>> p'
|
||||
"""
|
||||
await self.cdp_attach(tab_id)
|
||||
# IMPORTANT: the whole script must be a single IIFE so that
|
||||
# bridge.evaluate() detects it as "already wrapped" and returns
|
||||
# its value. If you let evaluate() re-wrap a script that
|
||||
# starts with a function declaration, the outer wrapper
|
||||
# discards the inner IIFE's return and you always get None —
|
||||
# which is exactly the bug this code had until 2026-04-11.
|
||||
script = (
|
||||
f"{self._SHADOW_QUERY_JS}"
|
||||
f"(function(){{"
|
||||
f"{self._SHADOW_QUERY_JS}"
|
||||
f"const el=_shadowQuery({json.dumps(selector)});"
|
||||
f"if(!el)return null;"
|
||||
f"const r=el.getBoundingClientRect();"
|
||||
@@ -1945,7 +2266,10 @@ class BeelineBridge:
|
||||
"Runtime.evaluate",
|
||||
{"expression": script, "returnByValue": True},
|
||||
)
|
||||
found = (result or {}).get("result", {}).get("result", {}).get("value", False)
|
||||
# One "result" hop — see navigate() comment. This was silently
|
||||
# returning False on every poll, so wait_for_selector always
|
||||
# reported "not found" after the full timeout.
|
||||
found = (result or {}).get("result", {}).get("value", False)
|
||||
if found:
|
||||
return {"ok": True, "selector": selector}
|
||||
await asyncio.sleep(0.1)
|
||||
@@ -1969,7 +2293,8 @@ class BeelineBridge:
|
||||
"Runtime.evaluate",
|
||||
{"expression": script, "returnByValue": True},
|
||||
)
|
||||
found = (result or {}).get("result", {}).get("result", {}).get("value", False)
|
||||
# Same unwrap bug as wait_for_selector.
|
||||
found = (result or {}).get("result", {}).get("value", False)
|
||||
if found:
|
||||
return {"ok": True, "text": text}
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
@@ -178,18 +178,37 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
delay_ms: int = 0,
|
||||
clear_first: bool = True,
|
||||
timeout_ms: int = 30000,
|
||||
use_insert_text: bool = True,
|
||||
) -> dict:
|
||||
"""
|
||||
Type text into an input element.
|
||||
|
||||
Automatically routes through a real CDP pointer click on the
|
||||
element before inserting text — so that rich-text editors like
|
||||
Lexical (Gmail, LinkedIn DMs), Draft.js (X compose), and
|
||||
ProseMirror (Reddit) see a native focus event and enable their
|
||||
submit buttons. See the gcu-browser skill for the full "click-
|
||||
then-type" pattern.
|
||||
|
||||
By default uses CDP Input.insertText which is the most reliable
|
||||
way to insert text into rich editors. Set
|
||||
``use_insert_text=False`` to fall back to per-character
|
||||
keyDown/keyUp events (needed only for code editors that fire
|
||||
on specific keystrokes, or when ``delay_ms`` typing animation
|
||||
is required).
|
||||
|
||||
Args:
|
||||
selector: CSS selector for the input element
|
||||
text: Text to type
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
delay_ms: Delay between keystrokes in ms (default: 0)
|
||||
delay_ms: Delay between keystrokes in ms (default: 0).
|
||||
Forces the per-keystroke fallback when > 0.
|
||||
clear_first: Clear existing text before typing (default: True)
|
||||
timeout_ms: Timeout waiting for element (default: 30000)
|
||||
use_insert_text: Use CDP Input.insertText (default: True) for
|
||||
reliable insertion into rich-text editors.
|
||||
Set False for per-keystroke dispatch.
|
||||
|
||||
Returns:
|
||||
Dict with type result
|
||||
@@ -223,6 +242,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
clear_first=clear_first,
|
||||
delay_ms=delay_ms,
|
||||
timeout_ms=timeout_ms,
|
||||
use_insert_text=use_insert_text,
|
||||
)
|
||||
log_tool_call(
|
||||
"browser_type",
|
||||
@@ -277,21 +297,34 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
selector: str | None = None,
|
||||
tab_id: int | None = None,
|
||||
profile: str | None = None,
|
||||
modifiers: list[str] | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Press a keyboard key.
|
||||
Press a keyboard key, optionally with modifier keys held.
|
||||
|
||||
Args:
|
||||
key: Key to press (e.g., 'Enter', 'Tab', 'Escape', 'ArrowDown')
|
||||
key: Key to press (e.g., 'Enter', 'Tab', 'Escape', 'ArrowDown',
|
||||
or a character like 'a')
|
||||
selector: Focus element first (optional)
|
||||
tab_id: Chrome tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
modifiers: Hold these modifier keys while pressing ``key``. Accepted
|
||||
values (case-insensitive): "alt", "ctrl"/"control", "meta"/"cmd",
|
||||
"shift". Examples: ``modifiers=["ctrl"], key="a"`` = Ctrl+A
|
||||
(select all); ``modifiers=["shift"], key="Tab"`` = Shift+Tab;
|
||||
``modifiers=["meta"], key="Enter"`` = Cmd+Enter.
|
||||
|
||||
Returns:
|
||||
Dict with press result
|
||||
"""
|
||||
start = time.perf_counter()
|
||||
params = {"key": key, "selector": selector, "tab_id": tab_id, "profile": profile}
|
||||
params = {
|
||||
"key": key,
|
||||
"selector": selector,
|
||||
"tab_id": tab_id,
|
||||
"profile": profile,
|
||||
"modifiers": modifiers,
|
||||
}
|
||||
|
||||
bridge = get_bridge()
|
||||
if not bridge or not bridge.is_connected:
|
||||
@@ -312,7 +345,9 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
return result
|
||||
|
||||
try:
|
||||
press_result = await bridge.press_key(target_tab, key, selector=selector)
|
||||
press_result = await bridge.press_key(
|
||||
target_tab, key, selector=selector, modifiers=modifiers
|
||||
)
|
||||
log_tool_call(
|
||||
"browser_press",
|
||||
params,
|
||||
|
||||
Reference in New Issue
Block a user