fix: context health and eviction

This commit is contained in:
Timothy
2026-04-15 11:40:45 -07:00
parent 22df99ef51
commit 252710fb41
13 changed files with 633 additions and 108 deletions
+6 -1
View File
@@ -39,7 +39,12 @@
"Bash(bun run:*)",
"Bash(npx eslint:*)",
"Bash(npm run:*)",
"Bash(npm test:*)"
"Bash(npm test:*)",
"Bash(grep -n \"PIL\\\\|Image\\\\|to_thread\\\\|run_in_executor\" /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
"WebFetch(domain:docs.litellm.ai)",
"Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
"Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
"Bash(grep -v ':0$')"
],
"additionalDirectories": [
"/home/timothy/.hive/skills/writing-hive-skills",
+48 -3
View File
@@ -3196,7 +3196,9 @@ class AgentLoop(AgentProtocol):
result = _build_tool_error_result(tc, raw)
else:
result = raw
results_by_id[tc.tool_use_id] = self._truncate_tool_result(result, tc.tool_name)
results_by_id[tc.tool_use_id] = await self._truncate_tool_result(
result, tc.tool_name
)
# Phase 3: record results into conversation in original order,
# build logged/real lists, and publish completed events.
@@ -3331,6 +3333,24 @@ class AgentLoop(AgentProtocol):
False,
)
# --- Image eviction: strip old screenshot image_content ---
# Screenshots from browser_screenshot are inlined as base64
# data URLs in message.image_content. Each screenshot costs
# ~250k tokens when the provider counts base64 as text
# (gemini, most non-Anthropic providers). Four screenshots
# in one conversation blew through gemini's 1M context in
# session_20260415_104727_5c4ed7ff and caused garbage
# output ("协日" as the final assistant text). We evict
# aggressively after every tool batch — independent of the
# char-based usage_ratio, which severely underestimates
# image cost (counts each image as ~2000 tokens vs the
# ~250k actually billed). Text metadata stays on the
# evicted messages so the agent can still reason about
# "I took a screenshot at step N".
_max_imgs = self._config.max_retained_screenshots
if _max_imgs >= 0:
await conversation.evict_old_images(keep_latest=_max_imgs)
# --- Mid-turn pruning: prevent context blowup within a single turn ---
if conversation.usage_ratio() >= 0.6:
protect = max(2000, self._config.max_context_tokens // 12)
@@ -3655,7 +3675,7 @@ class AgentLoop(AgentProtocol):
max_chars=max_chars,
)
def _truncate_tool_result(
async def _truncate_tool_result(
self,
result: ToolResult,
tool_name: str,
@@ -3671,8 +3691,33 @@ class AgentLoop(AgentProtocol):
- Large results (> limit): preview + file reference
- Errors: pass through unchanged
- read_file results: truncate with pagination hint (no re-spill)
For large results this does a synchronous JSON round-trip
(``json.loads`` + pretty-print ``json.dumps(indent=2)``) plus a
file write. On big payloads (web_search, web_fetch, full-page
extractions) this can block the event loop for hundreds of ms
per call. We offload to a worker thread so concurrent tool
executions keep running while one large result is being
pretty-printed and spilled to disk.
"""
return truncate_tool_result(
# Fast path: small results don't need thread offload. The
# function only touches disk / does heavy JSON work when the
# result exceeds either the truncation or spillover threshold,
# so cheap pass-throughs stay on the main loop.
needs_offload = (
len(result.content) > 10_000
and not result.is_error
)
if not needs_offload:
return truncate_tool_result(
result=result,
tool_name=tool_name,
max_tool_result_chars=self._config.max_tool_result_chars,
spillover_dir=self._config.spillover_dir,
next_spill_filename_fn=self._next_spill_filename,
)
return await asyncio.to_thread(
truncate_tool_result,
result=result,
tool_name=tool_name,
max_tool_result_chars=self._config.max_tool_result_chars,
+97 -11
View File
@@ -162,10 +162,17 @@ def update_run_cursor(
def _extract_spillover_filename(content: str) -> str | None:
"""Extract spillover filename from a tool result annotation.
Matches patterns produced by EventLoopNode._truncate_tool_result():
- Large result: "saved to 'web_search_1.txt'"
- Small result: "[Saved to 'web_search_1.txt']"
Matches patterns produced by ``truncate_tool_result``:
- New large-result header: "Full result saved at: /abs/path/file.txt"
- Legacy bracketed trailer: "[Saved to 'file.txt']" (pre-2026-04-15,
retained here so cold conversations still resolve)
"""
# New prose format — ``saved at: <absolute path>``, terminated by
# newline or end-of-string.
match = re.search(r"[Ss]aved at:\s*(\S+)", content)
if match:
return match.group(1)
# Legacy format.
match = re.search(r"[Ss]aved to '([^']+)'", content)
return match.group(1) if match else None
@@ -878,12 +885,14 @@ class NodeConversation:
if spillover:
placeholder = (
f"[Pruned tool result: {orig_len} chars. "
f"Full data in '{spillover}'. "
f"Use read_file('{spillover}') to retrieve.]"
f"Pruned tool result ({orig_len:,} chars) cleared from context. "
f"Full data saved at: {spillover}\n"
f"Read the complete data with read_file(path='{spillover}')."
)
else:
placeholder = f"[Pruned tool result: {orig_len} chars cleared from context.]"
placeholder = (
f"Pruned tool result ({orig_len:,} chars) cleared from context."
)
self._messages[i] = Message(
seq=msg.seq,
@@ -905,6 +914,81 @@ class NodeConversation:
self._last_api_input_tokens = None
return count
async def evict_old_images(self, keep_latest: int = 2) -> int:
    """Strip ``image_content`` from older messages, keeping the most recent.

    Screenshots from ``browser_screenshot`` are inlined into the
    message's ``image_content`` as base64 data URLs. Each screenshot
    costs ~250k tokens when the provider counts the base64 as text —
    four screenshots push a conversation over gemini's 1M context
    limit and trigger out-of-context garbage output (see
    ``session_20260415_104727_5c4ed7ff`` for the terminal case where
    the model emitted garbage as its final text then stopped).

    This method walks backward through messages and keeps
    ``image_content`` intact on the most recent ``keep_latest``
    messages that have images. Older messages get their
    ``image_content`` nulled out — the text content (metadata such as
    url, dimensions, scale hints) stays, but the raw bytes are
    dropped. Storage is updated too so cold-restore sees the same
    evicted state.

    Run this right after every tool-result batch so image context
    stays bounded even within a single iteration (the compaction
    pipeline only fires at iteration boundaries — too late for a
    single turn that takes several screenshots).

    Args:
        keep_latest: number of most-recent image-bearing messages to
            leave untouched. Negative values disable eviction.

    Returns:
        The number of messages whose ``image_content`` was evicted.
    """
    # Negative keep_latest means "eviction disabled"; empty history
    # has nothing to do either way.
    if not self._messages or keep_latest < 0:
        return 0
    # Find messages carrying images, walking newest → oldest so the
    # first ``keep_latest`` entries of the list are the ones to keep.
    image_indices: list[int] = []
    for i in range(len(self._messages) - 1, -1, -1):
        if self._messages[i].image_content:
            image_indices.append(i)
    # Nothing to evict if we have ≤ keep_latest images total.
    if len(image_indices) <= keep_latest:
        return 0
    # Evict everything past the first keep_latest (newest) entries.
    to_evict = image_indices[keep_latest:]
    evicted = 0
    for idx in to_evict:
        msg = self._messages[idx]
        # Message is immutable here — rebuild it field-for-field with
        # only image_content cleared.
        self._messages[idx] = Message(
            seq=msg.seq,
            role=msg.role,
            content=msg.content,
            tool_use_id=msg.tool_use_id,
            tool_calls=msg.tool_calls,
            is_error=msg.is_error,
            phase_id=msg.phase_id,
            is_transition_marker=msg.is_transition_marker,
            is_client_input=msg.is_client_input,
            image_content=None,  # ← dropped
            is_skill_content=msg.is_skill_content,
            run_id=msg.run_id,
        )
        evicted += 1
        # Persist the evicted form so a cold restore doesn't bring the
        # image bytes back.
        if self._store:
            await self._store.write_part(
                msg.seq, self._messages[idx].to_storage_dict()
            )
    if evicted:
        # Reset token estimate — image blocks no longer contribute.
        self._last_api_input_tokens = None
        logger.info(
            "evict_old_images: dropped image_content from %d message(s), "
            "kept %d most recent",
            evicted,
            keep_latest,
        )
    return evicted
async def compact(
self,
summary: str,
@@ -1165,16 +1249,18 @@ class NodeConversation:
# Nothing to save — skip file creation
conv_filename = ""
# Build reference message
# Build reference message. Prose format (no brackets) — see the
# poison-pattern note on truncate_tool_result. Frontier models
# autocomplete `[...']` trailers into their own text turns.
ref_parts: list[str] = []
if conv_filename:
full_path = str((spill_path / conv_filename).resolve())
ref_parts.append(
f"[Previous conversation saved to '{full_path}'. "
f"Use read_file('{conv_filename}') to review if needed.]"
f"Previous conversation saved at: {full_path}\n"
f"Read the full transcript with read_file('{conv_filename}')."
)
elif not collapsed_msgs:
ref_parts.append("[Previous freeform messages compacted.]")
ref_parts.append("(Previous freeform messages compacted.)")
# Aggressive: add collapsed tool-call history to the reference
if collapsed_msgs:
@@ -102,12 +102,14 @@ def microcompact(
orig_len = len(msg.content)
if spillover:
placeholder = (
f"[Old tool result cleared: {orig_len} chars. "
f"Full data in '{spillover}'. "
f"Use read_file('{spillover}') to retrieve.]"
f"Old tool result ({orig_len:,} chars) cleared from context. "
f"Full data saved at: {spillover}\n"
f"Read the complete data with read_file(path='{spillover}')."
)
else:
placeholder = f"[Old tool result cleared: {orig_len} chars.]"
placeholder = (
f"Old tool result ({orig_len:,} chars) cleared from context."
)
# Mutate in-place (microcompact is synchronous, no store writes)
conversation._messages[i] = Message(
@@ -142,7 +144,14 @@ def _find_tool_name_for_result(messages: list[Message], tool_msg: Message) -> st
def _extract_spillover_filename_inline(content: str) -> str | None:
"""Quick inline check for spillover filename in tool result content."""
"""Quick inline check for spillover filename in tool result content.
Matches both the new prose format ("saved at: /path") and the
legacy bracketed trailer ("saved to '/path'").
"""
match = re.search(r"saved at:\s*(\S+)", content, re.IGNORECASE)
if match:
return match.group(1)
match = re.search(r"saved to '([^']+)'", content, re.IGNORECASE)
return match.group(1) if match else None
@@ -215,14 +215,30 @@ def truncate_tool_result(
"""Persist tool result to file and optionally truncate for context.
When *spillover_dir* is configured, EVERY non-error tool result is
saved to a file (short filename like ``web_search_1.txt``). A
``[Saved to '...']`` annotation is appended so the reference
survives pruning and compaction.
written to disk for debugging. The LLM-visible content is then
shaped to avoid a **poison pattern** that we traced on 2026-04-15
through a gemini-3.1-pro-preview queen session: the prior format
appended ``\\n\\n[Saved to '/abs/path/file.txt']`` after every
small result, and frontier pattern-matching models (gemini 3.x in
particular) learned to autocomplete the `[Saved to '...']` trailer
in their own assistant turns, eventually degenerating into echoing
the whole tool result instead of deciding what to do next. See
``session_20260415_100751_d49f4c28/conversations/parts/0000000056.json``
for the terminal case where the model's "text" output was the full
tool_result JSON.
- Small results (≤ limit): full content kept + file annotation
- Large results (> limit): preview + file reference
- Errors: pass through unchanged
- read_file results: truncate with pagination hint (no re-spill)
Rules after the fix:
- **Small results (≤ limit):** pass content through unchanged. No
trailer. No annotation. The full content is already in the
message; the disk copy is for debugging only.
- **Large results (> limit):** preview + file reference, but
formatted as plain prose instead of a bracketed ``[...]``
pattern. Structured JSON metadata ("_saved_to") is embedded
inside the JSON body when the preview is JSON-shaped so the
model can locate the full file without seeing a mimicry-prone
bracket token outside the body.
- **Errors:** pass through unchanged.
- **read_file results:** truncate with pagination hint (no re-spill).
"""
limit = max_tool_result_chars
@@ -252,18 +268,20 @@ def truncate_tool_result(
else:
preview_block = result.content[:PREVIEW_CAP] + ""
# Prose header (no brackets).
header = (
f"[{tool_name} result: {len(result.content):,} chars — "
f"too large for context. Use offset_bytes/limit_bytes "
f"parameters to read smaller chunks.]"
f"Tool `{tool_name}` returned {len(result.content):,} characters "
f"(too large for context). Use offset_bytes / limit_bytes "
f"parameters to paginate smaller chunks."
)
if metadata_str:
header += f"\n\nData structure:\n{metadata_str}"
header += (
"\n\nWARNING: This is an INCOMPLETE preview. Do NOT draw conclusions or counts from it."
"\n\nWARNING: the preview below is a SAMPLE only — do NOT "
"draw counts, totals, or conclusions from it."
)
truncated = f"{header}\n\nPreview (small sample only):\n{preview_block}"
truncated = f"{header}\n\nPreview (truncated):\n{preview_block}"
logger.info(
"%s result truncated: %d%d chars (use offset/limit to paginate)",
tool_name,
@@ -301,7 +319,10 @@ def truncate_tool_result(
if limit > 0 and len(result.content) > limit:
# Large result: build a small, metadata-rich preview so the
# LLM cannot mistake it for the complete dataset.
# LLM cannot mistake it for the complete dataset. The
# preview is introduced as plain prose (no bracketed
# ``[Result from …]`` token) so it doesn't prime the model
# to autocomplete the same pattern in its next turn.
PREVIEW_CAP = 5000
# Extract structural metadata (array lengths, key names)
@@ -316,21 +337,22 @@ def truncate_tool_result(
else:
preview_block = result.content[:PREVIEW_CAP] + ""
# Assemble header with structural info + warning
# Prose header (no brackets). Absolute path still surfaced
# so the agent can read the full file, but it's framed as
# a sentence, not a bracketed trailer.
header = (
f"[Result from {tool_name}: {len(result.content):,} chars — "
f"too large for context, saved to '{abs_path}'.]\n"
f"Tool `{tool_name}` returned {len(result.content):,} characters "
f"(too large for context). Full result saved at: {abs_path}\n"
f"Read the complete data with read_file(path='{abs_path}').\n"
)
if metadata_str:
header += f"\nData structure:\n{metadata_str}"
header += f"\nData structure:\n{metadata_str}\n"
header += (
f"\n\nWARNING: The preview below is INCOMPLETE. "
f"Do NOT draw conclusions or counts from it. "
f"Use read_file(path='{abs_path}') to read the "
f"full data before analysis."
"\nWARNING: the preview below is a SAMPLE only — do NOT "
"draw counts, totals, or conclusions from it."
)
content = f"{header}\n\nPreview (small sample only):\n{preview_block}"
content = f"{header}\n\nPreview (truncated):\n{preview_block}"
logger.info(
"Tool result spilled to file: %s (%d chars → %s)",
tool_name,
@@ -338,10 +360,22 @@ def truncate_tool_result(
abs_path,
)
else:
# Small result: keep full content + annotation with absolute path
content = f"{result.content}\n\n[Saved to '{abs_path}']"
# Small result: pass content through UNCHANGED.
#
# The prior design appended `\n\n[Saved to '/abs/path']`
# after every small result so the agent could re-read the
# file later. But (a) the full content is already in the
# message, so there's nothing to re-read; (b) the
# `[Saved to '…']` trailer is a repeating token pattern
# that frontier pattern-matching models autocomplete into
# their own assistant turns, eventually echoing whole tool
# results as "text" instead of making decisions. Dropping
# the trailer entirely kills the poison pattern. Spilled
# files on disk still exist for debugging — they just
# aren't advertised in the LLM-visible message.
content = result.content
logger.info(
"Tool result saved to file: %s (%d chars → %s)",
"Tool result saved to file: %s (%d chars → %s, no trailer)",
tool_name,
len(result.content),
filename,
@@ -373,15 +407,17 @@ def truncate_tool_result(
else:
preview_block = result.content[:PREVIEW_CAP] + ""
# Prose header (no brackets) — see docstring for the poison
# pattern that the bracket format triggered.
header = (
f"[Result from {tool_name}: {len(result.content):,} chars — "
f"truncated to fit context budget.]"
f"Tool `{tool_name}` returned {len(result.content):,} characters "
f"(truncated to fit context budget — no spillover dir configured)."
)
if metadata_str:
header += f"\n\nData structure:\n{metadata_str}"
header += (
"\n\nWARNING: This is an INCOMPLETE preview. "
"Do NOT draw conclusions or counts from the preview alone."
"\n\nWARNING: the preview below is a SAMPLE only — do NOT "
"draw counts, totals, or conclusions from it."
)
truncated = f"{header}\n\n{preview_block}"
+75 -31
View File
@@ -2,6 +2,7 @@
from __future__ import annotations
import asyncio
import json
import logging
import time
@@ -83,6 +84,23 @@ class LoopConfig:
max_tool_result_chars: int = 30_000
spillover_dir: str | None = None
# Image retention in conversation history.
# Screenshots from ``browser_screenshot`` are inlined as base64
# data URLs inside message ``image_content``. Each full-page
# screenshot costs ~250k tokens when the provider counts the
# base64 as text (gemini, most non-Anthropic providers). Four
# screenshots in one conversation push gemini's 1M context over
# the limit and the model starts emitting garbage.
#
# The framework strips image_content from older messages after
# every tool-result batch, keeping only the most recent N
# screenshots. The text metadata on evicted messages (url, size,
# scale hints) is preserved so the agent can still reason about
# "I took a screenshot at step N that showed the compose modal".
# Raise this only if you genuinely need longer visual history AND
# you know your provider is using native image tokenization.
max_retained_screenshots: int = 2
# set_output value spilling.
max_output_value_chars: int = 2_000
@@ -166,7 +184,7 @@ class OutputAccumulator:
async def set(self, key: str, value: Any) -> None:
"""Set a key-value pair, auto-spilling large values to files."""
value = self._auto_spill(key, value)
value = await self._auto_spill(key, value)
self.values[key] = value
if self.store:
cursor = await self.store.read_cursor() or {}
@@ -175,41 +193,67 @@ class OutputAccumulator:
cursor["outputs"] = outputs
await self.store.write_cursor(cursor)
async def _auto_spill(self, key: str, value: Any) -> Any:
    """Save large values to a file and return a reference string.

    Runs the JSON serialization and file write on a worker thread so
    they don't block the asyncio event loop. For a 100k-char dict
    this used to freeze every concurrent tool call for ~50ms of
    ``json.dumps(indent=2)`` plus a sync disk write; for bigger
    payloads or slow storage (NFS, networked FS) the freeze was
    proportionally worse.

    Args:
        key: output key; used to derive the spill filename.
        value: the value being stored. Strings are written verbatim;
            dicts/lists are pretty-printed as JSON.

    Returns:
        The original *value* when it fits within ``max_value_chars``
        (or spilling is disabled), otherwise a prose reference string
        pointing at the spilled file.
    """
    if self.max_value_chars <= 0 or not self.spillover_dir:
        return value
    # Cheap size probe first — if the value is already a short
    # string we can skip both the JSON round-trip and the thread
    # hop entirely.
    if isinstance(value, str) and len(value) <= self.max_value_chars:
        return value

    def _spill_sync() -> Any:
        # JSON serialization for the size check (only for non-strings).
        if isinstance(value, str):
            val_str = value
        else:
            val_str = json.dumps(value, ensure_ascii=False)
        if len(val_str) <= self.max_value_chars:
            return value
        spill_path = Path(self.spillover_dir)
        spill_path.mkdir(parents=True, exist_ok=True)
        ext = ".json" if isinstance(value, (dict, list)) else ".txt"
        filename = f"output_{key}{ext}"
        write_content = (
            json.dumps(value, indent=2, ensure_ascii=False)
            if isinstance(value, (dict, list))
            else str(value)
        )
        file_path = spill_path / filename
        file_path.write_text(write_content, encoding="utf-8")
        file_size = file_path.stat().st_size
        logger.info(
            "set_output value auto-spilled: key=%s, %d chars -> %s (%d bytes)",
            key,
            len(val_str),
            filename,
            file_size,
        )
        # Use absolute path so parent agents can find files from subagents.
        #
        # Prose format (no brackets) — same fix as tool_result_handler:
        # frontier pattern-matching models autocomplete bracketed
        # `[Saved to '...']` trailers into their own assistant turns,
        # eventually degenerating into echoing the file path as text.
        # Keep the path accessible but frame it as plain prose.
        abs_path = str(file_path.resolve())
        return (
            f"Output saved at: {abs_path} ({file_size:,} bytes). "
            f"Read the full data with read_file(path='{abs_path}')."
        )

    # Offload the heavy JSON work + disk write; concurrent tool
    # executions keep running meanwhile.
    return await asyncio.to_thread(_spill_sync)
def get(self, key: str) -> Any | None:
return self.values.get(key)
@@ -344,6 +344,51 @@ Reddit's search input lives **two shadow levels deep** inside `reddit-search-lar
After submitting, press Escape to close the composer.
## File uploads — use `browser_upload`, never click the upload button
**Clicking an `<input type="file">` or the button that triggers one (X's photo button, LinkedIn's attach button, Gmail's paperclip) opens Chrome's native OS file picker. That dialog is rendered by the operating system, NOT the page, so CDP cannot see it, cannot interact with it, and the automation wedges.** This is the single most common way to lock up a browser session on any "compose with media" flow.
**The only correct pattern:** call `browser_upload(selector, file_paths)`. It uses the CDP `DOM.setFileInputFiles` method, which sets the files directly on the input element's internal state as if the user had picked them — no OS dialog ever opens.
```
# WRONG — opens the native file picker, agent gets stuck
browser_click_coordinate(photo_button_x, photo_button_y) # ❌
# RIGHT — sets the file programmatically, no dialog
browser_upload(
selector="input[type='file']", # the underlying file input
file_paths=["/absolute/path/to/image.png"],
)
```
**Finding the file input.** On most modern SPAs the visible "Add photo" / "Attach" button is a styled `<button>` or `<label>`, and the real `<input type="file">` is hidden (often `display:none` or `opacity:0`, positioned offscreen, wrapped in a `<label for="...">`, or injected on click). Use `browser_evaluate` to enumerate ALL file inputs on the page first:
```python
browser_evaluate("""
(function(){
const inputs = Array.from(document.querySelectorAll('input[type="file"]'));
return inputs.map(el => ({
name: el.name || '',
accept: el.accept || '',
multiple: el.multiple,
id: el.id || '',
inViewport: (() => {
const r = el.getBoundingClientRect();
return r.width > 0 && r.height > 0;
})(),
}));
})();
""")
```
Then pass the most specific selector that uniquely identifies the right input (e.g. `input[type='file'][accept*='image']` for a photo-only upload). `browser_upload` doesn't care if the input is hidden or offscreen — `DOM.setFileInputFiles` works on any valid file input node, visible or not.
**X / LinkedIn / Twitter pattern.** On X (`x.com/compose/post`), the photo upload input is `input[data-testid='fileInput']` — hidden, reachable via `browser_upload`. On LinkedIn feed compose, look for `input[type='file'][accept*='image']` inside the post-creation modal after clicking "Add media" (clicking the Add-media button reveals the input but does NOT open the dialog; only clicking the SECOND layer — the "From computer" entry — would trigger the picker. Stop at the first layer, find the input, call `browser_upload`).
**Verification after upload.** `DOM.setFileInputFiles` dispatches a `change` event on the input but NOT the `click` / `focus` events that some sites gate their UI on. Always verify the upload actually took effect by screenshotting the composer (the uploaded image should appear as a preview) or by checking for a "preview" / "remove" element that only exists post-upload. If verification fails, the site may be reading the file via some other bridge — fall back to reading the file bytes and pasting them via the clipboard (`navigator.clipboard.write` with a `ClipboardItem`) through `browser_evaluate`.
**If a native file picker DOES open** (you clicked the wrong thing): there is no recovery via CDP. Press Escape via `browser_press("Escape")` immediately — this dismisses the OS dialog in Chrome on Linux/macOS. Then find the actual `<input type='file'>` and use `browser_upload`.
## Common pitfalls
- **Typing into a rich-text editor without clicking first → send button stays disabled.** Draft.js (X), Lexical (Gmail, LinkedIn DMs), ProseMirror (Reddit), and React-controlled `contenteditable` elements only register input as "real" when the element received a native focus event — JS-sourced `.focus()` is not enough. `browser_type` now does this automatically via a real CDP pointer click before inserting text, but always verify the submit button's `disabled` state before clicking send. See the "ALWAYS click before typing" section above.
@@ -354,6 +399,7 @@ After submitting, press Escape to close the composer.
- **Relying on `innerHTML` in injected scripts on LinkedIn.** Silently discarded. Use `createElement` + `appendChild`.
- **Not waiting for SPA hydration.** `wait_until="load"` fires before React/Vue rendering on many sites. Add a 2–3 s sleep before querying for chrome elements.
- **Using `browser_type(selector)` on LinkedIn DMs or any shadow-DOM input.** Won't find the element. Fall back to click-to-focus + `browser_press` per character.
- **Clicking a "Photo" / "Attach" / "Upload" button to pick a file.** This opens Chrome's NATIVE OS file picker, which is rendered outside the web page and cannot be interacted with via CDP. Your automation will hang staring at an unreachable dialog. ALWAYS use `browser_upload(selector, file_paths)` against the underlying `<input type='file'>` element — see the "File uploads" section above for the full pattern. This is the single most common way to wedge a browser session on compose-with-media flows (X/LinkedIn/Gmail).
- **Keyboard shortcuts without the `code` field.** Chrome's shortcut dispatcher ignores keyboard events that lack a `code` or `windowsVirtualKeyCode`. `browser_press(..., modifiers=[...])` populates these automatically; raw `Input.dispatchKeyEvent` calls from `browser_evaluate` may not.
- **Taking a screenshot more than 10s after the last interaction** and expecting the highlight to still be visible. The overlay fades after 10s. Take the screenshot sooner, or re-trigger the interaction.
- **Expecting `browser_navigate` to return when you specified `wait_until="networkidle"` on a busy site.** networkidle is approximate — some sites keep a websocket or analytics beacon open forever. Use `"load"` or `"domcontentloaded"` for reliable timing.
@@ -246,6 +246,60 @@ if state['found'] and not state['disabled']:
browser_click("button.share-actions__primary-action")
```
## Posting WITH an image attached
**Do NOT click the "Add media" / image icon inside the feed post composer to pick a file.** LinkedIn renders a styled button that opens Chrome's native OS file picker when clicked, and that dialog is unreachable via CDP — the automation will hang on an invisible modal. Use `browser_upload` directly against the hidden `<input type='file'>`:
```python
# After the post modal is open and the editor has text:
# (A) First, click "Add media" to surface the file input
# (clicking THIS button reveals the input but does NOT itself open
# the OS picker on current LinkedIn — the picker only opens if
# you click the inner "Choose from your device" entry).
media_btn = browser_get_rect("button[aria-label*='image'], button[aria-label*='photo']")
browser_click_coordinate(media_btn.cx, media_btn.cy)
sleep(0.8)
# (B) Enumerate file inputs to find the right one
inputs = browser_evaluate("""
(function(){
return Array.from(document.querySelectorAll('input[type="file"]'))
.map((el, i) => ({
idx: i,
accept: el.accept || '',
name: el.name || '',
}));
})();
""")
# Expect to see one with accept='image/*' or accept containing 'image/jpeg'
# (C) Set the file programmatically — no dialog
browser_upload(
selector="input[type='file'][accept*='image']",
file_paths=["/absolute/path/to/logo.png"],
)
sleep(3) # LinkedIn shows an upload-progress bar + preview
# (D) Verify the image preview rendered before clicking Post
preview_ok = browser_evaluate("""
(function(){
// LinkedIn shows the preview as an <img> inside
// .share-creation-state__image-preview or similar.
return !!document.querySelector(
'.share-creation-state__preview img, .image-preview-container img'
);
})();
""")
if not preview_ok:
raise Exception("LinkedIn image upload did not render — do NOT click Post")
# (E) Now click Post as usual
browser_click("button.share-actions__primary-action")
sleep(4) # media post takes longer to commit than text-only
```
If the image isn't already on disk, write it first with `write_file(absolute_path, bytes)`. `browser_upload` only accepts absolute paths.
## Rate limits and safety
LinkedIn's abuse detection is aggressive. Respect these limits:
@@ -79,6 +79,61 @@ if state['found'] and not state['disabled']:
browser_press("Escape") # close any leftover modal
```
## Posting a tweet WITH an image
**Critical: NEVER click the photo button.** On `x.com/compose/post` the media button is a styled `<button>` that triggers Chrome's native OS file picker when clicked — that dialog is unreachable via CDP and will wedge the automation. Instead, set the file directly on the hidden `<input type='file'>` element using `browser_upload`:
```python
# 1. Open the compose modal as usual
browser_press("n")
sleep(1.5)
browser_click_coordinate(ta_rect.cx, ta_rect.cy)
sleep(0.5)
browser_type("[data-testid='tweetTextarea_0']", tweet_text)
# 2. Find the hidden file input X uses for media uploads.
# X's input is marked with data-testid='fileInput' and accepts
# image/*,video/*. It's hidden (display:none) but still mounted.
inputs = browser_evaluate("""
(function(){
return Array.from(document.querySelectorAll('input[type="file"]'))
.map(el => ({
testid: el.getAttribute('data-testid') || '',
accept: el.accept || '',
multiple: el.multiple,
}));
})();
""")
# Expect to see: [{testid: 'fileInput', accept: 'image/jpeg,...', multiple: true}]
# 3. Set the file WITHOUT opening any dialog
browser_upload(
selector="input[data-testid='fileInput']",
file_paths=["/absolute/path/to/photo.png"],
)
sleep(2) # X takes ~1-2s to show the preview thumbnail
# 4. Verify the preview rendered before posting — if not, the upload
# didn't land and Post button will fail.
preview = browser_evaluate("""
(function(){
// X renders uploaded media as an <img> with data-testid='attachments'
// (or similar) inside the composer.
const att = document.querySelector('[data-testid="attachments"] img');
return { hasPreview: !!att };
})();
""")
if not preview['hasPreview']:
raise Exception("Upload didn't render in composer — do NOT click Post")
# 5. Now click Post as usual
browser_click("[data-testid='tweetButton']")
sleep(3) # media upload + post takes longer than text-only
browser_press("Escape")
```
If you don't already have the image file on disk, write it first: `write_file("/tmp/x_upload.png", base64_bytes)` or copy from a known location. `browser_upload` requires an absolute file path — relative paths and `~` expansion are not supported.
## Reply to a post flow
The reply flow is the same shape as posting, with a few scroll / find-and-click steps before.
+142 -21
View File
@@ -42,6 +42,39 @@ BRIDGE_PORT = 9229
# CDP wait_until values
VALID_WAIT_UNTIL = {"commit", "domcontentloaded", "load", "networkidle"}
# Fast-fail polling default for element / text waits. 5 seconds is long
# enough to cover normal SPA render latency on loaded pages, short enough
# that a bad selector or hallucinated element fails fast instead of
# burning 30 wall-clock seconds per miss (the old behavior — see the
# 2026-04-14 gemini-3-flash x.com session where 7 of 14 browser_click
# calls each hit the 30s deadline for ~210s wasted total).
#
# navigate() keeps a longer default (30s) because real page loads can
# legitimately take that long.
DEFAULT_WAIT_TIMEOUT_MS: int = 5000
# Longer default for bridge _send calls that wrap genuinely slow ops
# (full-page screenshot, accessibility tree, navigate). Individual
# callers can pass their own value via _send(..., timeout=...).
_LONG_SEND_TIMEOUT_S: float = 60.0
async def _adaptive_poll_sleep(elapsed_s: float) -> None:
"""Sleep between DOM polls with an adaptive backoff.
Early polls are snappy (50ms) so a quickly-appearing element is
reported in ~100ms. Later polls back off (200ms, 500ms) so a
missing element doesn't thrash CDP with 300+ querySelector calls
before the deadline fires.
"""
if elapsed_s < 1.0:
await asyncio.sleep(0.05)
elif elapsed_s < 5.0:
await asyncio.sleep(0.2)
else:
await asyncio.sleep(0.5)
# Last interaction highlight per tab_id: {x, y, w, h, label, kind}
# kind: "rect" (element) or "point" (coordinate)
_interaction_highlights: dict[int, dict] = {}
@@ -296,9 +329,23 @@ class BeelineBridge:
msg = str(exc).lower()
return any(m in msg for m in self._CDP_DEAD_SESSION_MARKERS)
async def _cdp(self, tab_id: int, method: str, params: dict | None = None) -> dict:
async def _cdp(
self,
tab_id: int,
method: str,
params: dict | None = None,
*,
timeout: float | None = None,
) -> dict:
"""Send a CDP command to a tab.
``timeout`` (seconds) overrides the default bridge send timeout.
Pass a larger value for genuinely slow operations (full-page
screenshots over slow networks, accessibility tree on huge
pages) so they don't spuriously fail at the 30s floor. Pass a
smaller value for fast probes ("is this element present right
now") to fail fast.
On a dead-session error (Chrome detached externally, tab closed,
DevTools opened, cross-origin nav), evict the stale attach
cache entry, reattach, and retry once. Without this the Python
@@ -307,7 +354,13 @@ class BeelineBridge:
"""
start = time.perf_counter()
try:
result = await self._send("cdp", tabId=tab_id, method=method, params=params or {})
result = await self._send(
"cdp",
tabId=tab_id,
method=method,
params=params or {},
timeout=timeout,
)
duration_ms = (time.perf_counter() - start) * 1000
log_cdp_command(tab_id, method, params, result, duration_ms=duration_ms)
return result
@@ -327,7 +380,11 @@ class BeelineBridge:
self._cdp_attached.add(tab_id)
retry_start = time.perf_counter()
result = await self._send(
"cdp", tabId=tab_id, method=method, params=params or {}
"cdp",
tabId=tab_id,
method=method,
params=params or {},
timeout=timeout,
)
log_cdp_command(
tab_id,
@@ -594,10 +651,16 @@ class BeelineBridge:
selector: str,
button: str = "left",
click_count: int = 1,
timeout_ms: int = 30000,
timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
) -> dict:
"""Click an element by selector.
``timeout_ms`` controls how long we poll for the element to
appear in the DOM. Defaults to :data:`DEFAULT_WAIT_TIMEOUT_MS`
(5 s) so a missing or hallucinated selector fails fast. Pass a
larger value when the target genuinely needs longer to render
(e.g. post-navigation SPA hydration).
Uses multiple fallback methods for robustness:
1. CDP mouse events with JavaScript bounds
2. JavaScript click() as fallback
@@ -612,8 +675,12 @@ class BeelineBridge:
doc = await self._cdp(tab_id, "DOM.getDocument")
root_id = doc.get("root", {}).get("nodeId")
# Wait for element to appear
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
# Wait for element to appear. Adaptive polling:
# - first 1 s at 50 ms intervals (responsive on fast pages)
# - next 4 s at 200 ms
# - rest at 500 ms
poll_start = asyncio.get_event_loop().time()
deadline = poll_start + timeout_ms / 1000
node_id = None
while asyncio.get_event_loop().time() < deadline:
result = await self._cdp(
@@ -622,7 +689,7 @@ class BeelineBridge:
node_id = result.get("nodeId")
if node_id:
break
await asyncio.sleep(0.1)
await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)
if not node_id:
# Check if the element might be inside a Shadow DOM container
@@ -773,7 +840,11 @@ class BeelineBridge:
)
await asyncio.sleep(0.05)
# Mouse down
# Mouse down — if this hangs past the short wait budget we
# CANNOT claim success. The prior code swallowed TimeoutError
# with `pass` and returned ok=true further down, which is why
# the 2026-04-14 gemini session saw 7 clicks land at exactly
# 30s with status=ok even though the click had not landed.
try:
await asyncio.wait_for(
self._cdp(
@@ -787,14 +858,24 @@ class BeelineBridge:
"clickCount": click_count,
},
),
timeout=1.0,
timeout=2.0,
)
except TimeoutError:
pass # Continue even if timeout
return {
"ok": False,
"error": (
f"CDP mousePressed timed out for '{selector}' — "
"the click did not land. Consider browser_click_coordinate "
"with an explicit rect from browser_get_rect."
),
}
await asyncio.sleep(0.08)
# Mouse up
# Mouse up — same non-silent failure handling. A stuck
# mouseReleased means the press is still "held down" in
# Chrome's input state; we must surface the failure so the
# caller can retry or switch strategy.
try:
await asyncio.wait_for(
self._cdp(
@@ -811,7 +892,14 @@ class BeelineBridge:
timeout=3.0,
)
except TimeoutError:
pass # Continue even if timeout
return {
"ok": False,
"error": (
f"CDP mouseReleased timed out for '{selector}' — "
"the press event fired but release did not. The page "
"may be in a stuck input state; try browser_click_coordinate."
),
}
w = bounds_value.get("width", 0)
h = bounds_value.get("height", 0)
@@ -2174,7 +2262,19 @@ class BeelineBridge:
"scale": 1,
}
result = await self._cdp(tab_id, "Page.captureScreenshot", params)
# Pass the outer screenshot timeout budget to the
# underlying CDP call. Full-page screenshots over slow
# networks can legitimately take 20-40s; the default 30s
# _send floor used to make them fail spuriously right at
# the boundary. We give the CDP call the full timeout_s
# budget so the outer `asyncio.timeout(timeout_s)` is
# the only authority on how long we wait.
result = await self._cdp(
tab_id,
"Page.captureScreenshot",
params,
timeout=timeout_s,
)
data = result.get("data")
if not data:
@@ -2249,8 +2349,18 @@ class BeelineBridge:
logger.error("Screenshot failed: %s", e)
return {"ok": False, "error": str(e)}
async def wait_for_selector(self, tab_id: int, selector: str, timeout_ms: int = 30000) -> dict:
"""Wait for an element to appear."""
async def wait_for_selector(
self,
tab_id: int,
selector: str,
timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
) -> dict:
"""Wait for an element to appear.
Default 5 s fast-fail. Callers that need to wait longer (e.g.
a known slow post-navigation render) should pass an explicit
``timeout_ms``.
"""
await self.cdp_attach(tab_id)
script = f"""
@@ -2259,7 +2369,8 @@ class BeelineBridge:
}})()
"""
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
poll_start = asyncio.get_event_loop().time()
deadline = poll_start + timeout_ms / 1000
while asyncio.get_event_loop().time() < deadline:
result = await self._cdp(
tab_id,
@@ -2272,12 +2383,21 @@ class BeelineBridge:
found = (result or {}).get("result", {}).get("value", False)
if found:
return {"ok": True, "selector": selector}
await asyncio.sleep(0.1)
await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)
return {"ok": False, "error": f"Element not found within timeout: {selector}"}
async def wait_for_text(self, tab_id: int, text: str, timeout_ms: int = 30000) -> dict:
"""Wait for text to appear on the page."""
async def wait_for_text(
self,
tab_id: int,
text: str,
timeout_ms: int = DEFAULT_WAIT_TIMEOUT_MS,
) -> dict:
"""Wait for text to appear on the page.
Default 5 s fast-fail. Same fast-fail rationale as
:meth:`wait_for_selector`.
"""
await self.cdp_attach(tab_id)
script = f"""
@@ -2286,7 +2406,8 @@ class BeelineBridge:
}})()
"""
deadline = asyncio.get_event_loop().time() + timeout_ms / 1000
poll_start = asyncio.get_event_loop().time()
deadline = poll_start + timeout_ms / 1000
while asyncio.get_event_loop().time() < deadline:
result = await self._cdp(
tab_id,
@@ -2297,7 +2418,7 @@ class BeelineBridge:
found = (result or {}).get("result", {}).get("value", False)
if found:
return {"ok": True, "text": text}
await asyncio.sleep(0.1)
await _adaptive_poll_sleep(asyncio.get_event_loop().time() - poll_start)
return {"ok": False, "error": f"Text not found within timeout: {text}"}
+8 -2
View File
@@ -28,7 +28,7 @@ def register_advanced_tools(mcp: FastMCP) -> None:
text: str | None = None,
tab_id: int | None = None,
profile: str | None = None,
timeout_ms: int = 30000,
timeout_ms: int = 5000,
) -> dict:
"""
Wait for a condition.
@@ -39,7 +39,13 @@ def register_advanced_tools(mcp: FastMCP) -> None:
text: Wait for text to appear on page (optional)
tab_id: Chrome tab ID (default: active tab)
profile: Browser profile name (default: "default")
timeout_ms: Max wait time in ms (default: 30000)
timeout_ms: Max wait time in ms for the selector/text poll.
Default 5000ms (fast-fail). If the condition isn't met
within 5s the call returns {"ok": False, "error": ...}
and the agent can try a different approach instead of
burning 30s per miss. Pass a larger value (e.g. 15000)
only when you genuinely expect the element to take
longer than 5s to render.
Returns:
Dict with wait result
+15 -3
View File
@@ -6,6 +6,7 @@ All operations go through the Beeline extension via CDP - no Playwright required
from __future__ import annotations
import asyncio
import base64
import io
import json
@@ -277,9 +278,20 @@ def register_inspection_tools(mcp: FastMCP) -> None:
if annotate and target_tab in _interaction_highlights:
highlights = [_interaction_highlights[target_tab]]
# Normalize to 800px wide and annotate
data, physical_scale, css_scale = _resize_and_annotate(
data, css_width, dpr=dpr, highlights=highlights, width=width
# Normalize to 800px wide and annotate. Offloaded to a
# thread because PIL Image.open/resize/ImageDraw/composite on
# a 2-megapixel PNG blocks for ~150-300ms of CPU — plenty to
# freeze the asyncio event loop and delay every concurrent
# tool call during a screenshot. The function is reentrant
# (fresh PIL Image per call, no shared state), so to_thread
# is safe.
data, physical_scale, css_scale = await asyncio.to_thread(
_resize_and_annotate,
data,
css_width,
dpr,
highlights,
width,
)
_screenshot_scales[target_tab] = physical_scale
_screenshot_css_scales[target_tab] = css_scale
+8 -2
View File
@@ -30,7 +30,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: str | None = None,
button: Literal["left", "right", "middle"] = "left",
double_click: bool = False,
timeout_ms: int = 30000,
timeout_ms: int = 5000,
) -> dict:
"""
Click an element on the page.
@@ -41,7 +41,13 @@ def register_interaction_tools(mcp: FastMCP) -> None:
profile: Browser profile name (default: "default")
button: Mouse button to click (left, right, middle)
double_click: Perform double-click (default: False)
timeout_ms: Timeout waiting for element (default: 30000)
timeout_ms: How long to poll for the element to appear in the
DOM before giving up. Default 5000ms (fast-fail). A missing
or hallucinated selector returns "Element not found" in
<=5s so the agent can try a different approach quickly.
Pass a larger value (e.g. 15000) ONLY when you know the
element will take longer than 5s to render — for example,
right after a navigation that triggers slow hydration.
Returns:
Dict with click result and coordinates