Merge pull request #6682 from aden-hive/feat/image-capabilities
Release / Create Release (push) Waiting to run

feat: image capabilities — upload, screenshot passthrough, vision detection & fallback, aria refs
This commit is contained in:
Timothy @aden
2026-03-20 21:25:37 -07:00
committed by GitHub
28 changed files with 1586 additions and 178 deletions
@@ -702,6 +702,15 @@ stop_worker() to return to STAGING phase.
_queen_behavior_always = """
# Behavior
## Images attached by the user
Users can attach images directly to their chat messages. When you see an \
image in the conversation, analyze it using your native vision capability — \
do NOT say you cannot see images or that you lack access to files. The image \
is embedded in the message; no tool call is needed to view it. Describe what \
you see, answer questions about it, and use the visual content to inform your \
response just as you would text.
## CRITICAL RULE — ask_user / ask_user_multiple
Every response that ends with a question, a prompt, or expects user \
@@ -150,7 +150,7 @@ Call all three subagents in a single response to run them in parallel:
## GCU Anti-Patterns
- Using `browser_screenshot` to read text (use `browser_snapshot`)
- Using `browser_screenshot` to read text (use `browser_snapshot` instead; screenshots are for visual context only)
- Re-navigating after scrolling (resets scroll position)
- Attempting login on auth walls
- Forgetting `target_id` in multi-tab scenarios
+24
View File
@@ -33,12 +33,20 @@ class Message:
is_transition_marker: bool = False
# True when this message is real human input (from /chat), not a system prompt
is_client_input: bool = False
# Optional image content blocks (e.g. from browser_screenshot)
image_content: list[dict[str, Any]] | None = None
# True when message contains an activated skill body (AS-10: never prune)
is_skill_content: bool = False
def to_llm_dict(self) -> dict[str, Any]:
"""Convert to OpenAI-format message dict."""
if self.role == "user":
if self.image_content:
blocks: list[dict[str, Any]] = []
if self.content:
blocks.append({"type": "text", "text": self.content})
blocks.extend(self.image_content)
return {"role": "user", "content": blocks}
return {"role": "user", "content": self.content}
if self.role == "assistant":
@@ -49,6 +57,15 @@ class Message:
# role == "tool"
content = f"ERROR: {self.content}" if self.is_error else self.content
if self.image_content:
# Multimodal tool result: text + image content blocks
blocks: list[dict[str, Any]] = [{"type": "text", "text": content}]
blocks.extend(self.image_content)
return {
"role": "tool",
"tool_call_id": self.tool_use_id,
"content": blocks,
}
return {
"role": "tool",
"tool_call_id": self.tool_use_id,
@@ -74,6 +91,8 @@ class Message:
d["is_transition_marker"] = self.is_transition_marker
if self.is_client_input:
d["is_client_input"] = self.is_client_input
if self.image_content is not None:
d["image_content"] = self.image_content
return d
@classmethod
@@ -89,6 +108,7 @@ class Message:
phase_id=data.get("phase_id"),
is_transition_marker=data.get("is_transition_marker", False),
is_client_input=data.get("is_client_input", False),
image_content=data.get("image_content"),
)
@@ -375,6 +395,7 @@ class NodeConversation:
*,
is_transition_marker: bool = False,
is_client_input: bool = False,
image_content: list[dict[str, Any]] | None = None,
) -> Message:
msg = Message(
seq=self._next_seq,
@@ -383,6 +404,7 @@ class NodeConversation:
phase_id=self._current_phase,
is_transition_marker=is_transition_marker,
is_client_input=is_client_input,
image_content=image_content,
)
self._messages.append(msg)
self._next_seq += 1
@@ -411,6 +433,7 @@ class NodeConversation:
tool_use_id: str,
content: str,
is_error: bool = False,
image_content: list[dict[str, Any]] | None = None,
is_skill_content: bool = False,
) -> Message:
msg = Message(
@@ -420,6 +443,7 @@ class NodeConversation:
tool_use_id=tool_use_id,
is_error=is_error,
phase_id=self._current_phase,
image_content=image_content,
is_skill_content=is_skill_content,
)
self._messages.append(msg)
+126 -9
View File
@@ -14,6 +14,7 @@ from __future__ import annotations
import asyncio
import json
import logging
import os
import re
import time
from collections.abc import Awaitable, Callable
@@ -24,6 +25,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
from framework.graph.conversation import ConversationStore, NodeConversation
from framework.graph.node import NodeContext, NodeProtocol, NodeResult
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.provider import Tool, ToolResult, ToolUse
from framework.llm.stream_events import (
FinishEvent,
@@ -37,6 +39,56 @@ from framework.runtime.llm_debug_logger import log_llm_turn
logger = logging.getLogger(__name__)
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
"""Describe images using the best available vision model.
Called when the queen's model lacks vision support. Tries vision-capable
models in priority order based on available API keys and returns a bracketed
description to inject into the message text, or None if no vision model is
reachable.
"""
import litellm
# Build content blocks: prompt + all images
blocks: list[dict[str, Any]] = [
{
"type": "text",
"text": (
"Describe the following image(s) concisely but with enough detail "
"that a text-only AI assistant can understand the content and context."
),
}
]
blocks.extend(image_content)
# Ordered candidates based on available env vars
candidates: list[str] = []
if os.environ.get("OPENAI_API_KEY"):
candidates.append("gpt-4o-mini")
if os.environ.get("ANTHROPIC_API_KEY"):
candidates.append("claude-3-haiku-20240307")
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
candidates.append("gemini/gemini-1.5-flash")
for model in candidates:
try:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": blocks}],
max_tokens=512,
)
description = (response.choices[0].message.content or "").strip()
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]"
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
return None
@dataclass
class TriggerEvent:
"""A framework-level trigger signal (timer tick or webhook hit).
@@ -90,7 +142,13 @@ class _EscalationReceiver:
self._response: str | None = None
self._awaiting_input = True # So inject_worker_message() can prefer us
async def inject_event(self, content: str, *, is_client_input: bool = False) -> None:
async def inject_event(
self,
content: str,
*,
is_client_input: bool = False,
image_content: list[dict] | None = None,
) -> None:
"""Called by ExecutionStream.inject_input() when the user responds."""
self._response = content
self._event.set()
@@ -426,7 +484,9 @@ class EventLoopNode(NodeProtocol):
self._config = config or LoopConfig()
self._tool_executor = tool_executor
self._conversation_store = conversation_store
self._injection_queue: asyncio.Queue[tuple[str, bool]] = asyncio.Queue()
self._injection_queue: asyncio.Queue[tuple[str, bool, list[dict[str, Any]] | None]] = (
asyncio.Queue()
)
self._trigger_queue: asyncio.Queue[TriggerEvent] = asyncio.Queue()
# Client-facing input blocking state
self._input_ready = asyncio.Event()
@@ -784,7 +844,7 @@ class EventLoopNode(NodeProtocol):
)
# 6b. Drain injection queue
await self._drain_injection_queue(conversation)
await self._drain_injection_queue(conversation, ctx)
# 6b1. Drain trigger queue (framework-level signals)
await self._drain_trigger_queue(conversation)
@@ -1910,7 +1970,13 @@ class EventLoopNode(NodeProtocol):
conversation=conversation if _is_continuous else None,
)
async def inject_event(self, content: str, *, is_client_input: bool = False) -> None:
async def inject_event(
self,
content: str,
*,
is_client_input: bool = False,
image_content: list[dict[str, Any]] | None = None,
) -> None:
"""Inject an external event or user input into the running loop.
The content becomes a user message prepended to the next iteration.
@@ -1926,8 +1992,10 @@ class EventLoopNode(NodeProtocol):
human user (e.g. /chat endpoint), False for external events
(e.g. worker question forwarded by the frontend). Controls
message formatting in _drain_injection_queue, not wake behavior.
image_content: Optional list of image content blocks (OpenAI
image_url format) to include alongside the text.
"""
await self._injection_queue.put((content, is_client_input))
await self._injection_queue.put((content, is_client_input, image_content))
self._input_ready.set()
async def inject_trigger(self, trigger: TriggerEvent) -> None:
@@ -2101,6 +2169,24 @@ class EventLoopNode(NodeProtocol):
messages = conversation.to_llm_messages()
# Debug: log whether the last user message contains image blocks
for _m in reversed(messages):
if _m.get("role") == "user":
_content = _m.get("content")
if isinstance(_content, list):
_img_count = sum(
1
for _b in _content
if isinstance(_b, dict) and _b.get("type") == "image_url"
)
if _img_count:
logger.info(
"[%s] LLM call: last user message has %d image block(s)",
node_id,
_img_count,
)
break
# Defensive guard: ensure messages don't end with an assistant
# message. The Anthropic API rejects "assistant message prefill"
# (conversations must end with a user or tool message). This can
@@ -2770,10 +2856,21 @@ class EventLoopNode(NodeProtocol):
real_tool_results.append(tool_entry)
logged_tool_calls.append(tool_entry)
# Strip image content for models that can't handle it
image_content = result.image_content
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
logger.info(
"Stripping image_content from tool result — model '%s' "
"does not support images in tool results",
ctx.llm.model,
)
image_content = None
await conversation.add_tool_result(
tool_use_id=tc.tool_use_id,
content=result.content,
is_error=result.is_error,
image_content=image_content,
is_skill_content=result.is_skill_content,
)
if (
@@ -3914,6 +4011,7 @@ class EventLoopNode(NodeProtocol):
tool_use_id=result.tool_use_id,
content=truncated,
is_error=False,
image_content=result.image_content,
)
spill_dir = self._config.spillover_dir
@@ -3986,6 +4084,7 @@ class EventLoopNode(NodeProtocol):
tool_use_id=result.tool_use_id,
content=content,
is_error=False,
image_content=result.image_content,
)
# No spillover_dir — truncate in-place if needed
@@ -4028,6 +4127,7 @@ class EventLoopNode(NodeProtocol):
tool_use_id=result.tool_use_id,
content=truncated,
is_error=False,
image_content=result.image_content,
)
return result
@@ -4698,20 +4798,37 @@ class EventLoopNode(NodeProtocol):
]
await self._conversation_store.write_cursor(cursor)
async def _drain_injection_queue(self, conversation: NodeConversation) -> int:
async def _drain_injection_queue(self, conversation: NodeConversation, ctx: NodeContext) -> int:
"""Drain all pending injected events as user messages. Returns count."""
count = 0
while not self._injection_queue.empty():
try:
content, is_client_input = self._injection_queue.get_nowait()
content, is_client_input, image_content = self._injection_queue.get_nowait()
logger.info(
"[drain] injected message (client_input=%s): %s",
"[drain] injected message (client_input=%s, images=%d): %s",
is_client_input,
len(image_content) if image_content else 0,
content[:200] if content else "(empty)",
)
# For models that don't support images, fall back to a text description
if image_content and ctx.llm:
if not supports_image_tool_results(ctx.llm.model):
logger.info(
"Model '%s' does not support images — attempting vision fallback",
ctx.llm.model,
)
description = await _describe_images_as_text(image_content)
if description:
content = f"{content}\n\n{description}" if content else description
logger.info("[drain] image described as text via vision fallback")
else:
logger.info("[drain] no vision fallback available — images dropped")
image_content = None
# Real user input is stored as-is; external events get a prefix
if is_client_input:
await conversation.add_user_message(content, is_client_input=True)
await conversation.add_user_message(
content, is_client_input=True, image_content=image_content
)
else:
await conversation.add_user_message(f"[External event]: {content}")
count += 1
+5 -2
View File
@@ -43,8 +43,11 @@ Follow these rules for reliable, efficient browser interaction.
`browser_snapshot` separately after every action.
Only call `browser_snapshot` when you need a fresh view without
performing an action, or after setting `auto_snapshot=false`.
- Do NOT use `browser_screenshot` for reading text content —
it produces huge base64 images with no searchable text.
- Do NOT use `browser_screenshot` to read text — use
`browser_snapshot` for that (compact, searchable, fast).
- DO use `browser_screenshot` when you need visual context:
charts, images, canvas elements, layout verification, or when
the snapshot doesn't capture what you need.
- Only fall back to `browser_get_text` for extracting specific
small elements by CSS selector.
+106
View File
@@ -0,0 +1,106 @@
"""Model capability checks for LLM providers.
Vision support rules are derived from official vendor documentation:
- ZAI (z.ai): docs.z.ai/guides/vlm — GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only
- MiniMax: platform.minimax.io/docs — minimax-vl-01 is vision; M2.x are text-only
- DeepSeek: api-docs.deepseek.com — deepseek-vl2 is vision; chat/reasoner are text-only
- Cerebras: inference-docs.cerebras.ai — no vision models at all
- Groq: console.groq.com/docs/vision — vision capable; treat as supported by default
- Ollama/LM Studio/vLLM/llama.cpp: local runners — denied by default; model names
  don't reliably indicate vision support, so users must configure explicitly
"""
from __future__ import annotations
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
if "/" in model:
return model.split("/", 1)[1]
return model
# Step 1: explicit vision allow-list — these always support images regardless
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
# is allowed even though glm-4.6 is denied.
# Matched against the bare model name (provider prefix stripped, lower-cased).
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
    # ZAI/GLM vision models (docs.z.ai/guides/vlm)
    "glm-4v",  # GLM-4V series (legacy)
    "glm-4.6v",  # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
    # DeepSeek vision models
    "deepseek-vl",  # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
    # MiniMax vision model
    "minimax-vl",  # minimax-vl-01
)

# Step 2: provider-level deny — every model from this provider is text-only.
# Matched against the full lower-cased model string (including the prefix).
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
    # Cerebras: inference-docs.cerebras.ai lists only text models
    "cerebras/",
    # Local runners: model names don't reliably indicate vision support
    "ollama/",
    "ollama_chat/",
    "lm_studio/",
    "vllm/",
    "llamacpp/",
)

# Step 3: per-model deny — text-only models within otherwise mixed providers.
# Matched against the bare model name (provider prefix stripped, lower-cased).
# The vision allow-list above is checked first, so vision variants of the same
# family are already handled before these deny patterns are reached.
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
    # --- ZAI / GLM family ---
    # text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
    # vision: glm-4v, glm-4.6v (caught by allow-list above)
    "glm-5",
    "glm-4.6",  # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
    "glm-4.7",
    "glm-4.5",
    "zai-glm",
    # --- DeepSeek ---
    # text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
    # vision: deepseek-vl2 (caught by allow-list above)
    # Note: LiteLLM's deepseek handler may flatten content lists for some models;
    # VL models are allowed through and rely on LiteLLM's native VL support.
    "deepseek-chat",
    "deepseek-coder",
    "deepseek-reasoner",
    # --- MiniMax ---
    # text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
    # vision: minimax-vl-01 (caught by allow-list above)
    "minimax-m2",
    "minimax-text",
    "abab",
)
def supports_image_tool_results(model: str) -> bool:
    """Return whether *model* can receive image content in messages.

    Used to gate both user-message images and tool-result image blocks.

    Checks, in order:
      1. Vision allow-list  -> True  (known vision model, skip all denies)
      2. Provider deny      -> False (entire provider is text-only)
      3. Model deny         -> False (text-only model in a mixed provider)
      4. Default            -> True  (assume capable; unknown providers/models)
    """
    lowered = model.lower()
    bare = _model_name(lowered)
    # 1. Known vision model — allowed regardless of any deny rule below.
    if bare.startswith(_VISION_ALLOW_BARE_PREFIXES):
        return True
    # 2. Entire provider ships only text models.
    if lowered.startswith(_TEXT_ONLY_PROVIDER_PREFIXES):
        return False
    # 3. Text-only variant within a mixed-capability family.
    if bare.startswith(_TEXT_ONLY_MODEL_BARE_PREFIXES):
        return False
    # 4. Default: assume vision capable.
    # Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
    return True
+1
View File
@@ -45,6 +45,7 @@ class ToolResult:
tool_use_id: str
content: str
is_error: bool = False
image_content: list[dict[str, Any]] | None = None
is_skill_content: bool = False # AS-10: marks activated skill body, protected from pruning
+23 -10
View File
@@ -509,17 +509,30 @@ class MCPClient:
error_text = content_item.text
raise RuntimeError(f"MCP tool '{tool_name}' failed: {error_text}")
# Extract content
# Extract content — preserve image blocks alongside text
if result.content:
# MCP returns content as a list of content items
if len(result.content) > 0:
content_item = result.content[0]
# Check if it's a text content item
if hasattr(content_item, "text"):
return content_item.text
elif hasattr(content_item, "data"):
return content_item.data
return result.content
text_parts: list[str] = []
image_parts: list[dict[str, Any]] = []
for item in result.content:
if hasattr(item, "text"):
text_parts.append(item.text)
elif hasattr(item, "data") and hasattr(item, "mimeType"):
# MCP ImageContent — preserve as structured image block
image_parts.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{item.mimeType};base64,{item.data}",
},
}
)
elif hasattr(item, "data"):
text_parts.append(str(item.data))
text = "\n".join(text_parts) if text_parts else ""
if image_parts:
return {"_text": text, "_images": image_parts}
return text if text else None
return None
+10 -1
View File
@@ -245,6 +245,13 @@ class ToolRegistry:
def _wrap_result(tool_use_id: str, result: Any) -> ToolResult:
if isinstance(result, ToolResult):
return result
# MCP client returns dict with _images when image content is present
if isinstance(result, dict) and "_images" in result:
return ToolResult(
tool_use_id=tool_use_id,
content=result.get("_text", ""),
image_content=result["_images"],
)
return ToolResult(
tool_use_id=tool_use_id,
content=json.dumps(result) if not isinstance(result, str) else result,
@@ -572,7 +579,9 @@ class ToolRegistry:
}
merged_inputs = {**clean_inputs, **filtered_context}
result = client_ref.call_tool(tool_name, merged_inputs)
# MCP tools return content array, extract the result
# MCP client already extracts content (returns str
# or {"_text": ..., "_images": ...} for image results).
# Handle legacy list format from HTTP transport.
if isinstance(result, list) and len(result) > 0:
if isinstance(result[0], dict) and "text" in result[0]:
return result[0]["text"]
+9 -2
View File
@@ -1474,6 +1474,7 @@ class AgentRuntime:
graph_id: str | None = None,
*,
is_client_input: bool = False,
image_content: list[dict[str, Any]] | None = None,
) -> bool:
"""Inject user input into a running client-facing node.
@@ -1486,6 +1487,8 @@ class AgentRuntime:
graph_id: Optional graph to search first (defaults to active graph)
is_client_input: True when the message originates from a real
human user (e.g. /chat endpoint), False for external events.
image_content: Optional list of image content blocks (OpenAI
image_url format) to include alongside the text.
Returns:
True if input was delivered, False if no matching node found
@@ -1497,7 +1500,9 @@ class AgentRuntime:
target = graph_id or self._active_graph_id
if target in self._graphs:
for stream in self._graphs[target].streams.values():
if await stream.inject_input(node_id, content, is_client_input=is_client_input):
if await stream.inject_input(
node_id, content, is_client_input=is_client_input, image_content=image_content
):
return True
# Then search all other graphs
@@ -1505,7 +1510,9 @@ class AgentRuntime:
if gid == target:
continue
for stream in reg.streams.values():
if await stream.inject_input(node_id, content, is_client_input=is_client_input):
if await stream.inject_input(
node_id, content, is_client_input=is_client_input, image_content=image_content
):
return True
return False
+4 -1
View File
@@ -433,6 +433,7 @@ class ExecutionStream:
content: str,
*,
is_client_input: bool = False,
image_content: list[dict[str, Any]] | None = None,
) -> bool:
"""Inject user input into a running client-facing EventLoopNode.
@@ -444,7 +445,9 @@ class ExecutionStream:
for executor in self._active_executors.values():
node = executor.node_registry.get(node_id)
if node is not None and hasattr(node, "inject_event"):
await node.inject_event(content, is_client_input=is_client_input)
await node.inject_event(
content, is_client_input=is_client_input, image_content=image_content
)
return True
return False
+11 -4
View File
@@ -108,7 +108,10 @@ async def handle_chat(request: web.Request) -> web.Response:
The input box is permanently connected to the queen agent.
Worker input is handled separately via /worker-input.
Body: {"message": "hello"}
Body: {"message": "hello", "images": [{"type": "image_url", "image_url": {"url": "data:..."}}]}
The optional ``images`` field accepts a list of OpenAI-format image_url
content blocks. The frontend encodes images as base64 data URIs.
"""
session, err = resolve_session(request)
if err:
@@ -116,15 +119,16 @@ async def handle_chat(request: web.Request) -> web.Response:
body = await request.json()
message = body.get("message", "")
image_content = body.get("images") or None # list[dict] | None
if not message:
if not message and not image_content:
return web.json_response({"error": "message is required"}, status=400)
queen_executor = session.queen_executor
if queen_executor is not None:
node = queen_executor.node_registry.get("queen")
if node is not None and hasattr(node, "inject_event"):
await node.inject_event(message, is_client_input=True)
await node.inject_event(message, is_client_input=True, image_content=image_content)
# Publish to EventBus so the session event log captures user messages
from framework.runtime.event_bus import AgentEvent, EventType
@@ -134,7 +138,10 @@ async def handle_chat(request: web.Request) -> web.Response:
stream_id="queen",
node_id="queen",
execution_id=session.id,
data={"content": message},
data={
"content": message,
"image_count": len(image_content) if image_content else 0,
},
)
)
return web.json_response(
+30
View File
@@ -28,6 +28,8 @@ import contextlib
import json
import logging
import shutil
import subprocess
import sys
import time
from pathlib import Path
@@ -51,8 +53,11 @@ def _get_manager(request: web.Request) -> SessionManager:
def _session_to_live_dict(session) -> dict:
"""Serialize a live Session to the session-primary JSON shape."""
from framework.llm.capabilities import supports_image_tool_results
info = session.worker_info
phase_state = getattr(session, "phase_state", None)
queen_model: str = getattr(getattr(session, "runner", None), "model", "") or ""
return {
"session_id": session.id,
"worker_id": session.worker_id,
@@ -68,6 +73,7 @@ def _session_to_live_dict(session) -> dict:
"queen_phase": phase_state.phase
if phase_state
else ("staging" if session.worker_runtime else "planning"),
"queen_supports_images": supports_image_tool_results(queen_model) if queen_model else True,
}
@@ -978,6 +984,29 @@ async def handle_discover(request: web.Request) -> web.Response:
return web.json_response(result)
async def handle_reveal_session_folder(request: web.Request) -> web.Response:
    """POST /api/sessions/{session_id}/reveal — open session data folder in the OS file manager.

    Security: the session id comes straight from the URL path, so it is
    validated as a single path segment before being joined to the storage
    root — otherwise a crafted id (e.g. ``../../etc``) could create and
    reveal an arbitrary directory.
    """
    manager: SessionManager = request.app["manager"]
    session_id = request.match_info["session_id"]
    session = manager.get_session(session_id)
    storage_session_id = (session.queen_resume_from or session.id) if session else session_id

    # Path-traversal guard: the id must be a plain single path component.
    if (
        not storage_session_id
        or storage_session_id in (".", "..")
        or Path(storage_session_id).name != storage_session_id
    ):
        return web.json_response({"error": "invalid session id"}, status=400)

    folder = Path.home() / ".hive" / "queen" / "session" / storage_session_id
    folder.mkdir(parents=True, exist_ok=True)
    try:
        # Fire-and-forget: hand the folder to the platform's file manager.
        if sys.platform == "darwin":
            subprocess.Popen(["open", str(folder)])
        elif sys.platform == "win32":
            subprocess.Popen(["explorer", str(folder)])
        else:
            subprocess.Popen(["xdg-open", str(folder)])
    except Exception as exc:
        return web.json_response({"error": str(exc)}, status=500)
    return web.json_response({"path": str(folder)})
# ------------------------------------------------------------------
# Route registration
# ------------------------------------------------------------------
@@ -1002,6 +1031,7 @@ def register_routes(app: web.Application) -> None:
app.router.add_delete("/api/sessions/{session_id}/worker", handle_unload_worker)
# Session info
app.router.add_post("/api/sessions/{session_id}/reveal", handle_reveal_session_folder)
app.router.add_get("/api/sessions/{session_id}/stats", handle_session_stats)
app.router.add_get("/api/sessions/{session_id}/entry-points", handle_session_entry_points)
app.router.add_patch(
+2 -2
View File
@@ -34,8 +34,8 @@ export const executionApi = {
graph_id: graphId,
}),
chat: (sessionId: string, message: string) =>
api.post<ChatResult>(`/sessions/${sessionId}/chat`, { message }),
chat: (sessionId: string, message: string, images?: { type: string; image_url: { url: string } }[]) =>
api.post<ChatResult>(`/sessions/${sessionId}/chat`, { message, ...(images?.length ? { images } : {}) }),
/** Queue context for the queen without triggering an LLM response. */
queenContext: (sessionId: string, message: string) =>
+4
View File
@@ -81,6 +81,10 @@ export const sessionsApi = {
eventsHistory: (sessionId: string) =>
api.get<{ events: AgentEvent[]; session_id: string }>(`/sessions/${sessionId}/events/history`),
/** Open the session's data folder in the OS file manager. */
revealFolder: (sessionId: string) =>
api.post<{ path: string }>(`/sessions/${sessionId}/reveal`),
/** List all queen sessions on disk — live + cold (post-restart). */
history: () =>
api.get<{ sessions: Array<{ session_id: string; cold: boolean; live: boolean; has_messages: boolean; created_at: number; agent_name?: string | null; agent_path?: string | null }> }>("/sessions/history"),
+2
View File
@@ -14,6 +14,8 @@ export interface LiveSession {
intro_message?: string;
/** Queen operating phase — "planning", "building", "staging", or "running" */
queen_phase?: "planning" | "building" | "staging" | "running";
/** Whether the queen's LLM supports image content in messages */
queen_supports_images?: boolean;
/** Present in 409 conflict responses when worker is still loading */
loading?: boolean;
}
+276 -47
View File
@@ -1,5 +1,19 @@
import { memo, useState, useRef, useEffect, useMemo } from "react";
import { Send, Square, Crown, Cpu, Check, Loader2 } from "lucide-react";
import {
Send,
Square,
Crown,
Cpu,
Check,
Loader2,
Paperclip,
X,
} from "lucide-react";
export interface ImageContent {
type: "image_url";
image_url: { url: string };
}
export interface ContextUsageEntry {
usagePct: number;
@@ -10,7 +24,9 @@ export interface ContextUsageEntry {
import MarkdownContent from "@/components/MarkdownContent";
import QuestionWidget from "@/components/QuestionWidget";
import MultiQuestionWidget from "@/components/MultiQuestionWidget";
import ParallelSubagentBubble, { type SubagentGroup } from "@/components/ParallelSubagentBubble";
import ParallelSubagentBubble, {
type SubagentGroup,
} from "@/components/ParallelSubagentBubble";
export interface ChatMessage {
id: string;
@@ -18,7 +34,13 @@ export interface ChatMessage {
agentColor: string;
content: string;
timestamp: string;
type?: "system" | "agent" | "user" | "tool_status" | "worker_input_request" | "run_divider";
type?:
| "system"
| "agent"
| "user"
| "tool_status"
| "worker_input_request"
| "run_divider";
role?: "queen" | "worker";
/** Which worker thread this message belongs to (worker agent name) */
thread?: string;
@@ -26,6 +48,8 @@ export interface ChatMessage {
createdAt?: number;
/** Queen phase active when this message was created */
phase?: "planning" | "building" | "staging" | "running";
/** Images attached to a user message */
images?: ImageContent[];
/** Backend node_id that produced this message — used for subagent grouping */
nodeId?: string;
/** Backend execution_id for this message */
@@ -34,7 +58,7 @@ export interface ChatMessage {
interface ChatPanelProps {
messages: ChatMessage[];
onSend: (message: string, thread: string) => void;
onSend: (message: string, thread: string, images?: ImageContent[]) => void;
isWaiting?: boolean;
/** When true a worker is thinking (not yet streaming) */
isWorkerWaiting?: boolean;
@@ -43,6 +67,8 @@ interface ChatPanelProps {
activeThread: string;
/** When true, the input is disabled (e.g. during loading) */
disabled?: boolean;
/** When false, the image attach button is hidden (model lacks vision support) */
supportsImages?: boolean;
/** Called when user clicks the stop button to cancel the queen's current turn */
onCancel?: () => void;
/** Pending question from ask_user — replaces textarea when present */
@@ -50,7 +76,9 @@ interface ChatPanelProps {
/** Options for the pending question */
pendingOptions?: string[] | null;
/** Multiple questions from ask_user_multiple */
pendingQuestions?: { id: string; prompt: string; options?: string[] }[] | null;
pendingQuestions?:
| { id: string; prompt: string; options?: string[] }[]
| null;
/** Called when user submits an answer to the pending question */
onQuestionSubmit?: (answer: string, isOther: boolean) => void;
/** Called when user submits answers to multiple questions */
@@ -86,7 +114,8 @@ const TOOL_HEX = [
function toolHex(name: string): string {
let hash = 0;
for (let i = 0; i < name.length; i++) hash = (hash * 31 + name.charCodeAt(i)) | 0;
for (let i = 0; i < name.length; i++)
hash = (hash * 31 + name.charCodeAt(i)) | 0;
return TOOL_HEX[Math.abs(hash) % TOOL_HEX.length];
}
@@ -134,12 +163,18 @@ function ToolActivityRow({ content }: { content: string }) {
<span
key={`run-${p.name}`}
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
style={{ color: hex, backgroundColor: `${hex}18`, border: `1px solid ${hex}35` }}
style={{
color: hex,
backgroundColor: `${hex}18`,
border: `1px solid ${hex}35`,
}}
>
<Loader2 className="w-2.5 h-2.5 animate-spin" />
{p.name}
{p.count > 1 && (
<span className="text-[10px] font-medium opacity-70">×{p.count}</span>
<span className="text-[10px] font-medium opacity-70">
×{p.count}
</span>
)}
</span>
);
@@ -150,7 +185,11 @@ function ToolActivityRow({ content }: { content: string }) {
<span
key={`done-${p.name}`}
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
style={{ color: hex, backgroundColor: `${hex}18`, border: `1px solid ${hex}35` }}
style={{
color: hex,
backgroundColor: `${hex}18`,
border: `1px solid ${hex}35`,
}}
>
<Check className="w-2.5 h-2.5" />
{p.name}
@@ -165,7 +204,14 @@ function ToolActivityRow({ content }: { content: string }) {
);
}
const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: ChatMessage; queenPhase?: "planning" | "building" | "staging" | "running" }) {
const MessageBubble = memo(
function MessageBubble({
msg,
queenPhase,
}: {
msg: ChatMessage;
queenPhase?: "planning" | "building" | "staging" | "running";
}) {
const isUser = msg.type === "user";
const isQueen = msg.role === "queen";
const color = getColor(msg.agent, msg.role);
@@ -200,7 +246,21 @@ const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: Ch
return (
<div className="flex justify-end">
<div className="max-w-[75%] bg-primary text-primary-foreground text-sm leading-relaxed rounded-2xl rounded-br-md px-4 py-3">
{msg.images && msg.images.length > 0 && (
<div className="flex flex-wrap gap-2 mb-2">
{msg.images.map((img, i) => (
<img
key={i}
src={img.image_url.url}
alt={`attachment ${i + 1}`}
className="max-h-48 max-w-full rounded-lg object-contain"
/>
))}
</div>
)}
{msg.content && (
<p className="whitespace-pre-wrap break-words">{msg.content}</p>
)}
</div>
</div>
);
@@ -222,24 +282,31 @@ const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: Ch
<Cpu className="w-3.5 h-3.5" style={{ color }} />
)}
</div>
<div className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}>
<div
className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}
>
<div className="flex items-center gap-2 mb-1">
<span className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`} style={{ color }}>
<span
className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
style={{ color }}
>
{msg.agent}
</span>
<span
className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
isQueen ? "bg-primary/15 text-primary" : "bg-muted text-muted-foreground"
isQueen
? "bg-primary/15 text-primary"
: "bg-muted text-muted-foreground"
}`}
>
{isQueen
? ((msg.phase ?? queenPhase) === "running"
? (msg.phase ?? queenPhase) === "running"
? "running"
: (msg.phase ?? queenPhase) === "staging"
? "staging"
: (msg.phase ?? queenPhase) === "planning"
? "planning"
: "building")
: "building"
: "Worker"}
</span>
</div>
@@ -253,15 +320,41 @@ const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: Ch
</div>
</div>
);
}, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content && prev.msg.phase === next.msg.phase && prev.queenPhase === next.queenPhase);
},
(prev, next) =>
prev.msg.id === next.msg.id &&
prev.msg.content === next.msg.content &&
prev.msg.phase === next.msg.phase &&
prev.queenPhase === next.queenPhase,
);
export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, pendingQuestions, onQuestionSubmit, onMultiQuestionSubmit, onQuestionDismiss, queenPhase, contextUsage }: ChatPanelProps) {
export default function ChatPanel({
messages,
onSend,
isWaiting,
isWorkerWaiting,
isBusy,
activeThread,
disabled,
onCancel,
pendingQuestion,
pendingOptions,
pendingQuestions,
onQuestionSubmit,
onMultiQuestionSubmit,
onQuestionDismiss,
queenPhase,
contextUsage,
supportsImages = true,
}: ChatPanelProps) {
const [input, setInput] = useState("");
const [pendingImages, setPendingImages] = useState<ImageContent[]>([]);
const [readMap, setReadMap] = useState<Record<string, number>>({});
const bottomRef = useRef<HTMLDivElement>(null);
const scrollRef = useRef<HTMLDivElement>(null);
const stickToBottom = useRef(true);
const textareaRef = useRef<HTMLTextAreaElement>(null);
const fileInputRef = useRef<HTMLInputElement>(null);
const threadMessages = messages.filter((m) => {
if (m.type === "system" && !m.thread) return false;
@@ -270,7 +363,8 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
// tool-use-only turns that have no visible text. During live operation
// tool pills provide context, but on resume the pills are gone so
// the empty bubble is meaningless.
if (m.role === "queen" && !m.type && (!m.content || !m.content.trim())) return false;
if (m.role === "queen" && !m.type && (!m.content || !m.content.trim()))
return false;
return true;
});
@@ -317,7 +411,8 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
// Worker message from a non-subagent node means the graph has
// moved on to the next stage. Close the bubble even if some
// subagents are still streaming in the background.
if (m.role === "worker" && m.nodeId && !m.nodeId.includes(":subagent:")) break;
if (m.role === "worker" && m.nodeId && !m.nodeId.includes(":subagent:"))
break;
// Soft interruption (queen output, system, tool_status without
// nodeId) — render it normally but keep the subagent run going
@@ -382,31 +477,63 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
const handleSubmit = (e: React.FormEvent) => {
e.preventDefault();
if (!input.trim()) return;
onSend(input.trim(), activeThread);
if (!input.trim() && pendingImages.length === 0) return;
onSend(
input.trim(),
activeThread,
pendingImages.length > 0 ? pendingImages : undefined,
);
setInput("");
setPendingImages([]);
if (textareaRef.current) textareaRef.current.style.height = "auto";
};
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const files = Array.from(e.target.files ?? []);
if (files.length === 0) return;
files.forEach((file) => {
const reader = new FileReader();
reader.onload = (ev) => {
const url = ev.target?.result as string;
setPendingImages((prev) => [
...prev,
{ type: "image_url", image_url: { url } },
]);
};
reader.readAsDataURL(file);
});
// Reset so the same file can be re-selected
e.target.value = "";
};
return (
<div className="flex flex-col h-full min-w-0">
{/* Compact sub-header */}
<div className="px-5 pt-4 pb-2 flex items-center gap-2">
<p className="text-[11px] text-muted-foreground font-medium uppercase tracking-wider">Conversation</p>
<p className="text-[11px] text-muted-foreground font-medium uppercase tracking-wider">
Conversation
</p>
</div>
{/* Messages */}
<div ref={scrollRef} onScroll={handleScroll} className="flex-1 overflow-auto px-5 py-4 space-y-3">
<div
ref={scrollRef}
onScroll={handleScroll}
className="flex-1 overflow-auto px-5 py-4 space-y-3"
>
{renderItems.map((item) =>
item.kind === "parallel" ? (
<div key={item.groupId}>
<ParallelSubagentBubble groupId={item.groupId} groups={item.groups} />
<ParallelSubagentBubble
groupId={item.groupId}
groups={item.groups}
/>
</div>
) : (
<div key={item.msg.id}>
<MessageBubble msg={item.msg} queenPhase={queenPhase} />
</div>
)
),
)}
{/* Show typing indicator while waiting for first queen response (disabled + empty chat) */}
@@ -424,9 +551,18 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
</div>
<div className="border border-primary/20 bg-primary/5 rounded-2xl rounded-tl-md px-4 py-3">
<div className="flex gap-1.5">
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "0ms" }} />
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "150ms" }} />
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "300ms" }} />
<span
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
style={{ animationDelay: "0ms" }}
/>
<span
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
style={{ animationDelay: "150ms" }}
/>
<span
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
style={{ animationDelay: "300ms" }}
/>
</div>
</div>
</div>
@@ -444,9 +580,18 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
</div>
<div className="bg-muted/60 rounded-2xl rounded-tl-md px-4 py-3">
<div className="flex gap-1.5">
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "0ms" }} />
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "150ms" }} />
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "300ms" }} />
<span
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
style={{ animationDelay: "0ms" }}
/>
<span
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
style={{ animationDelay: "150ms" }}
/>
<span
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
style={{ animationDelay: "300ms" }}
/>
</div>
</div>
</div>
@@ -458,46 +603,84 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
{(() => {
if (!contextUsage) return null;
const queenUsage = contextUsage["__queen__"];
const workerEntries = Object.entries(contextUsage).filter(([k]) => k !== "__queen__");
const workerUsage = workerEntries.length > 0
? workerEntries.reduce((best, [, v]) => (v.usagePct > best.usagePct ? v : best), workerEntries[0][1])
const workerEntries = Object.entries(contextUsage).filter(
([k]) => k !== "__queen__",
);
const workerUsage =
workerEntries.length > 0
? workerEntries.reduce(
(best, [, v]) => (v.usagePct > best.usagePct ? v : best),
workerEntries[0][1],
)
: undefined;
if (!queenUsage && !workerUsage) return null;
return (
<div className="flex items-center gap-3 mx-4 px-3 py-1 rounded-lg bg-muted/30 border border-border/20 group/ctx flex-shrink-0">
{queenUsage && (
<div className="flex items-center gap-2 flex-1 min-w-0" title={`Queen: ${(queenUsage.estimatedTokens / 1000).toFixed(1)}k / ${(queenUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${queenUsage.messageCount} messages`}>
<Crown className="w-3 h-3 flex-shrink-0" style={{ color: "hsl(45,95%,58%)" }} />
<div
className="flex items-center gap-2 flex-1 min-w-0"
title={`Queen: ${(queenUsage.estimatedTokens / 1000).toFixed(1)}k / ${(queenUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${queenUsage.messageCount} messages`}
>
<Crown
className="w-3 h-3 flex-shrink-0"
style={{ color: "hsl(45,95%,58%)" }}
/>
<div className="flex-1 h-1.5 rounded-full bg-muted/50 overflow-hidden min-w-[60px]">
<div
className="h-full rounded-full transition-all duration-500 ease-out"
style={{
width: `${Math.min(queenUsage.usagePct, 100)}%`,
backgroundColor: queenUsage.usagePct >= 90 ? "hsl(0,65%,55%)" : queenUsage.usagePct >= 70 ? "hsl(35,90%,55%)" : "hsl(45,95%,58%)",
backgroundColor:
queenUsage.usagePct >= 90
? "hsl(0,65%,55%)"
: queenUsage.usagePct >= 70
? "hsl(35,90%,55%)"
: "hsl(45,95%,58%)",
}}
/>
</div>
<span className="text-[10px] text-muted-foreground/70 flex-shrink-0 tabular-nums">
<span className="group-hover/ctx:hidden">{queenUsage.usagePct}%</span>
<span className="hidden group-hover/ctx:inline">{(queenUsage.estimatedTokens / 1000).toFixed(1)}k / {(queenUsage.maxTokens / 1000).toFixed(0)}k</span>
<span className="group-hover/ctx:hidden">
{queenUsage.usagePct}%
</span>
<span className="hidden group-hover/ctx:inline">
{(queenUsage.estimatedTokens / 1000).toFixed(1)}k /{" "}
{(queenUsage.maxTokens / 1000).toFixed(0)}k
</span>
</span>
</div>
)}
{workerUsage && (
<div className="flex items-center gap-2 flex-1 min-w-0" title={`Worker: ${(workerUsage.estimatedTokens / 1000).toFixed(1)}k / ${(workerUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${workerUsage.messageCount} messages`}>
<Cpu className="w-3 h-3 flex-shrink-0" style={{ color: "hsl(220,60%,55%)" }} />
<div
className="flex items-center gap-2 flex-1 min-w-0"
title={`Worker: ${(workerUsage.estimatedTokens / 1000).toFixed(1)}k / ${(workerUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${workerUsage.messageCount} messages`}
>
<Cpu
className="w-3 h-3 flex-shrink-0"
style={{ color: "hsl(220,60%,55%)" }}
/>
<div className="flex-1 h-1.5 rounded-full bg-muted/50 overflow-hidden min-w-[60px]">
<div
className="h-full rounded-full transition-all duration-500 ease-out"
style={{
width: `${Math.min(workerUsage.usagePct, 100)}%`,
backgroundColor: workerUsage.usagePct >= 90 ? "hsl(0,65%,55%)" : workerUsage.usagePct >= 70 ? "hsl(35,90%,55%)" : "hsl(220,60%,55%)",
backgroundColor:
workerUsage.usagePct >= 90
? "hsl(0,65%,55%)"
: workerUsage.usagePct >= 70
? "hsl(35,90%,55%)"
: "hsl(220,60%,55%)",
}}
/>
</div>
<span className="text-[10px] text-muted-foreground/70 flex-shrink-0 tabular-nums">
<span className="group-hover/ctx:hidden">{workerUsage.usagePct}%</span>
<span className="hidden group-hover/ctx:inline">{(workerUsage.estimatedTokens / 1000).toFixed(1)}k / {(workerUsage.maxTokens / 1000).toFixed(0)}k</span>
<span className="group-hover/ctx:hidden">
{workerUsage.usagePct}%
</span>
<span className="hidden group-hover/ctx:inline">
{(workerUsage.estimatedTokens / 1000).toFixed(1)}k /{" "}
{(workerUsage.maxTokens / 1000).toFixed(0)}k
</span>
</span>
</div>
)}
@@ -506,7 +689,9 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
})()}
{/* Input area — question widget replaces textarea when a question is pending */}
{pendingQuestions && pendingQuestions.length >= 2 && onMultiQuestionSubmit ? (
{pendingQuestions &&
pendingQuestions.length >= 2 &&
onMultiQuestionSubmit ? (
<MultiQuestionWidget
questions={pendingQuestions}
onSubmit={onMultiQuestionSubmit}
@@ -521,7 +706,47 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
/>
) : (
<form onSubmit={handleSubmit} className="p-4">
{/* Image preview strip */}
{pendingImages.length > 0 && (
<div className="flex flex-wrap gap-2 mb-2 px-1">
{pendingImages.map((img, i) => (
<div key={i} className="relative group">
<img
src={img.image_url.url}
alt={`preview ${i + 1}`}
className="h-16 w-16 object-cover rounded-lg border border-border"
/>
<button
type="button"
onClick={() =>
setPendingImages((prev) => prev.filter((_, j) => j !== i))
}
className="absolute -top-1.5 -right-1.5 w-4 h-4 rounded-full bg-destructive text-destructive-foreground flex items-center justify-center opacity-0 group-hover:opacity-100 transition-opacity"
>
<X className="w-2.5 h-2.5" />
</button>
</div>
))}
</div>
)}
<div className="flex items-center gap-3 bg-muted/40 rounded-xl px-4 py-2.5 border border-border focus-within:border-primary/40 transition-colors">
<input
ref={fileInputRef}
type="file"
accept="image/*"
multiple
className="hidden"
onChange={handleFileChange}
/>
<button
type="button"
disabled={disabled || !supportsImages}
onClick={() => supportsImages && fileInputRef.current?.click()}
className="flex-shrink-0 p-1 rounded-md text-muted-foreground hover:text-foreground disabled:opacity-30 transition-colors"
title={supportsImages ? "Attach image" : "Image not supported by the current model"}
>
<Paperclip className="w-4 h-4" />
</button>
<textarea
ref={textareaRef}
rows={1}
@@ -538,7 +763,9 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
handleSubmit(e);
}
}}
placeholder={disabled ? "Connecting to agent..." : "Message Queen Bee..."}
placeholder={
disabled ? "Connecting to agent..." : "Message Queen Bee..."
}
disabled={disabled}
className="flex-1 bg-transparent text-sm text-foreground outline-none placeholder:text-muted-foreground disabled:opacity-50 disabled:cursor-not-allowed resize-none overflow-y-auto"
/>
@@ -553,7 +780,9 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
) : (
<button
type="submit"
disabled={!input.trim() || disabled}
disabled={
(!input.trim() && pendingImages.length === 0) || disabled
}
className="p-2 rounded-lg bg-primary text-primary-foreground disabled:opacity-30 hover:opacity-90 transition-opacity"
>
<Send className="w-4 h-4" />
+20 -3
View File
@@ -1,7 +1,7 @@
import { useState, useCallback, useRef, useEffect, useMemo } from "react";
import ReactDOM from "react-dom";
import { useSearchParams, useNavigate } from "react-router-dom";
import { Plus, KeyRound, Sparkles, Layers, ChevronLeft, Bot, Loader2, WifiOff, X } from "lucide-react";
import { Plus, KeyRound, Sparkles, Layers, ChevronLeft, Bot, Loader2, WifiOff, X, FolderOpen } from "lucide-react";
import type { GraphNode, NodeStatus } from "@/components/graph-types";
import DraftGraph from "@/components/DraftGraph";
import ChatPanel, { type ChatMessage } from "@/components/ChatPanel";
@@ -354,6 +354,8 @@ interface AgentBackendState {
pendingQuestionSource: "queen" | "worker" | null;
/** Per-node context window usage (from context_usage_updated events) */
contextUsage: Record<string, { usagePct: number; messageCount: number; estimatedTokens: number; maxTokens: number }>;
/** Whether the queen's LLM supports image content — false disables the attach button */
queenSupportsImages: boolean;
}
function defaultAgentState(): AgentBackendState {
@@ -392,6 +394,7 @@ function defaultAgentState(): AgentBackendState {
pendingQuestions: null,
pendingQuestionSource: null,
contextUsage: {},
queenSupportsImages: true,
};
}
@@ -923,6 +926,7 @@ export default function Workspace() {
queenReady: true,
queenPhase: qPhase,
queenBuilding: qPhase === "building",
queenSupportsImages: liveSession.queen_supports_images !== false,
// Restore flowchart overlay from persisted events
...(restoredFlowchartMap ? { flowchartMap: restoredFlowchartMap } : {}),
...(restoredOriginalDraft ? { originalDraft: restoredOriginalDraft, draftGraph: null } : {}),
@@ -1122,6 +1126,7 @@ export default function Workspace() {
displayName,
queenPhase: initialPhase,
queenBuilding: initialPhase === "building",
queenSupportsImages: session.queen_supports_images !== false,
// Restore flowchart overlay from persisted events
...(restoredFlowchartMap ? { flowchartMap: restoredFlowchartMap } : {}),
...(restoredOriginalDraft ? { originalDraft: restoredOriginalDraft, draftGraph: null } : {}),
@@ -2613,7 +2618,7 @@ export default function Workspace() {
});
// --- handleSend ---
const handleSend = useCallback((text: string, thread: string) => {
const handleSend = useCallback((text: string, thread: string, images?: import("@/components/ChatPanel").ImageContent[]) => {
if (!activeSession) return;
const state = agentStates[activeWorker];
@@ -2679,6 +2684,7 @@ export default function Workspace() {
const userMsg: ChatMessage = {
id: makeId(), agent: "You", agentColor: "",
content: text, timestamp: "", type: "user", thread, createdAt: Date.now(),
images,
};
setSessionsByAgent(prev => ({
...prev,
@@ -2690,7 +2696,7 @@ export default function Workspace() {
updateAgentState(activeWorker, { isTyping: true, queenIsTyping: true });
if (state?.sessionId && state?.ready) {
executionApi.chat(state.sessionId, text).catch((err: unknown) => {
executionApi.chat(state.sessionId, text, images).catch((err: unknown) => {
const errMsg = err instanceof Error ? err.message : String(err);
const errorChatMsg: ChatMessage = {
id: makeId(), agent: "System", agentColor: "",
@@ -3106,6 +3112,16 @@ export default function Workspace() {
<KeyRound className="w-3.5 h-3.5" />
Credentials
</button>
{activeAgentState?.sessionId && (
<button
onClick={() => sessionsApi.revealFolder(activeAgentState.sessionId!).catch(() => {})}
className="flex items-center gap-1.5 px-3 py-1.5 rounded-md text-xs font-medium text-muted-foreground hover:text-foreground hover:bg-muted/50 transition-colors flex-shrink-0"
title="Open session data folder"
>
<FolderOpen className="w-3.5 h-3.5" />
Data
</button>
)}
</TopBar>
{/* Main content area */}
@@ -3224,6 +3240,7 @@ export default function Workspace() {
onMultiQuestionSubmit={handleMultiQuestionAnswer}
onQuestionDismiss={handleQuestionDismiss}
contextUsage={activeAgentState?.contextUsage}
supportsImages={activeAgentState?.queenSupportsImages ?? true}
/>
)}
</div>
+58
View File
@@ -0,0 +1,58 @@
"""Tests for LLM model capability checks."""
from __future__ import annotations
import pytest
from framework.llm.capabilities import supports_image_tool_results
class TestSupportsImageToolResults:
    """Verify the deny-list correctly identifies models that can't handle images."""

    @pytest.mark.parametrize(
        "model",
        [
            # Model ids expected to accept image content, across a range of
            # provider prefixes (bare, openai/, anthropic/, azure/, etc.).
            "gpt-4o",
            "gpt-4o-mini",
            "gpt-4-turbo",
            "openai/gpt-4o",
            "anthropic/claude-sonnet-4-20250514",
            "claude-haiku-4-5-20251001",
            "gemini/gemini-1.5-pro",
            "google/gemini-1.5-flash",
            "mistral/mistral-large",
            "groq/llama3-70b",
            "together/meta-llama/Llama-3-70b",
            "fireworks_ai/llama-v3-70b",
            "azure/gpt-4o",
            # Custom routing prefixes wrapping a vision-capable model.
            "kimi/claude-sonnet-4-20250514",
            "hive/claude-sonnet-4-20250514",
        ],
    )
    def test_supported_models(self, model: str):
        # Anything not on the deny-list is treated as image-capable.
        assert supports_image_tool_results(model) is True

    @pytest.mark.parametrize(
        "model",
        [
            # Providers/backends on the deny-list (text-only tool results).
            "deepseek/deepseek-chat",
            "deepseek/deepseek-coder",
            "deepseek-chat",
            "deepseek-reasoner",
            "ollama/llama3",
            "ollama/mistral",
            "ollama_chat/llama3",
            "lm_studio/my-model",
            "vllm/meta-llama/Llama-3-70b",
            "llamacpp/model",
            "cerebras/llama3-70b",
        ],
    )
    def test_unsupported_models(self, model: str):
        assert supports_image_tool_results(model) is False

    def test_case_insensitive(self):
        # The check must not depend on the casing of the model id.
        assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
        assert supports_image_tool_results("OLLAMA/llama3") is False
        assert supports_image_tool_results("GPT-4o") is True
+3
View File
@@ -48,6 +48,9 @@ dev = [
sandbox = [
"RestrictedPython>=7.0",
]
browser = [
"pillow>=10.0.0",
]
ocr = [
"pytesseract>=0.3.10",
"pillow>=10.0.0",
+192
View File
@@ -0,0 +1,192 @@
"""Ref system for aria snapshots.
Assigns short `[ref=eN]` markers to interactive elements in Playwright's
aria_snapshot() output so the LLM can reference elements by ref instead of
constructing fragile CSS selectors.
Usage:
annotated, ref_map = annotate_snapshot(raw_snapshot)
# ... later, when the LLM says selector="e5" ...
playwright_selector = resolve_ref("e5", ref_map)
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .session import BrowserSession
# ---------------------------------------------------------------------------
# Role sets (matching Playwright's aria roles that matter for interaction)
# ---------------------------------------------------------------------------
# Aria roles that always receive a [ref=eN] marker — the user can act on them.
INTERACTIVE_ROLES: frozenset[str] = frozenset(
    {
        "button",
        "checkbox",
        "combobox",
        "link",
        "listbox",
        "menuitem",
        "menuitemcheckbox",
        "menuitemradio",
        "option",
        "radio",
        "scrollbar",
        "searchbox",
        "slider",
        "spinbutton",
        "switch",
        "tab",
        "textbox",
        "treeitem",
    }
)

# Content roles that receive a ref only when they carry a quoted accessible
# name (see annotate_snapshot: `role in NAMED_CONTENT_ROLES and name`).
NAMED_CONTENT_ROLES: frozenset[str] = frozenset(
    {
        "cell",
        "heading",
        "img",
    }
)

# Regex: captures indent, role, optional quoted name, and trailing text.
# Example line: "  - button \"Submit\" [disabled]"
#   group(1)=indent "  ", group(2)=role "button",
#   group(3)=name "Submit" (or None), group(4)=rest " [disabled]"
_LINE_RE = re.compile(r"^(\s*-\s+)(\w+)(?:\s+\"([^\"]*)\")?(.*?)$")
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class RefEntry:
    """A single ref entry mapping to a Playwright role selector."""

    # Aria role of the element (e.g. "button").
    role: str
    # Accessible name from the snapshot line, or None if no quoted name.
    name: str | None
    # 0-based index among elements sharing the same (role, name) pair,
    # used to disambiguate duplicates via Playwright's `>> nth=` suffix.
    nth: int


# ref_id (e.g. "e0") -> RefEntry
RefMap = dict[str, RefEntry]
# ---------------------------------------------------------------------------
# annotate_snapshot
# ---------------------------------------------------------------------------
def annotate_snapshot(snapshot: str) -> tuple[str, RefMap]:
    """Inject ``[ref=eN]`` markers into an aria snapshot.

    Walks the snapshot line by line. Every interactive element — and every
    named-content element that carries a quoted name — receives a sequential
    ref id ``e0, e1, ...`` appended to its line. Duplicate (role, name)
    pairs are disambiguated by an ``nth`` occurrence index.

    Returns:
        (annotated_text, ref_map) where ref_map maps ref ids to RefEntry.
    """
    out_lines = snapshot.split("\n")
    ref_map: RefMap = {}
    # Occurrences of each (role, name) pair seen so far — feeds the nth index.
    occurrence: dict[tuple[str, str | None], int] = {}
    for idx, raw_line in enumerate(out_lines):
        match = _LINE_RE.match(raw_line)
        if match is None:
            continue
        role, name = match.group(2), match.group(3)  # name is None if unquoted
        eligible = role in INTERACTIVE_ROLES or (
            role in NAMED_CONTENT_ROLES and name
        )
        if not eligible:
            continue
        pair = (role, name)
        nth = occurrence.get(pair, 0)
        occurrence[pair] = nth + 1
        # ref ids are dense and assigned in line order; len(ref_map) grows
        # by exactly one per annotated line, so it doubles as the counter.
        ref_id = f"e{len(ref_map)}"
        ref_map[ref_id] = RefEntry(role=role, name=name, nth=nth)
        # Append the marker after stripping any trailing whitespace.
        out_lines[idx] = raw_line.rstrip() + f" [ref={ref_id}]"
    return "\n".join(out_lines), ref_map
# ---------------------------------------------------------------------------
# resolve_ref
# ---------------------------------------------------------------------------
_REF_PATTERN = re.compile(r"^e\d+$")


def resolve_ref(selector: str, ref_map: RefMap | None) -> str:
    """Resolve a ref id (e.g. ``"e5"``) to a Playwright role selector.

    If *selector* doesn't look like a ref (``e\\d+``), it's returned as-is
    so that plain CSS selectors keep working.

    Raises:
        ValueError: If the ref is not found or no snapshot has been taken.
    """
    # Anything that isn't exactly "e<digits>" is treated as a literal
    # CSS / XPath / role selector and passed straight through.
    if _REF_PATTERN.match(selector) is None:
        return selector
    if ref_map is None:
        raise ValueError(
            f"Ref '{selector}' used but no snapshot has been taken yet. "
            "Call browser_snapshot first."
        )
    try:
        entry = ref_map[selector]
    except KeyError:
        valid = ", ".join(sorted(ref_map, key=lambda k: int(k[1:])))
        raise ValueError(
            f"Ref '{selector}' not found. Valid refs: {valid}. "
            "The page may have changed — take a new snapshot."
        ) from None
    # Translate the entry into a Playwright role selector, escaping the
    # accessible name so embedded backslashes and quotes survive.
    if entry.name is None:
        base = f"role={entry.role}"
    else:
        quoted = entry.name.replace("\\", "\\\\").replace('"', '\\"')
        base = f'role={entry.role}[name="{quoted}"]'
    # nth is always appended so duplicate (role, name) pairs stay unambiguous.
    return f"{base} >> nth={entry.nth}"
# ---------------------------------------------------------------------------
# Convenience wrapper
# ---------------------------------------------------------------------------
def resolve_selector(
    selector: str,
    session: BrowserSession,
    target_id: str | None,
) -> str:
    """Resolve a selector that might be a ref, using the session's ref maps.

    Args:
        selector: A CSS selector or ref id (e.g. ``"e5"``).
        session: The current BrowserSession.
        target_id: The target page id (falls back to session.active_page_id).
    """
    # A falsy target_id (None or "") falls back to the active page.
    page_id = target_id or session.active_page_id
    mapping = None
    if page_id:
        mapping = session.ref_maps.get(page_id)
    return resolve_ref(selector, mapping)
+4
View File
@@ -353,6 +353,7 @@ class BrowserSession:
active_page_id: str | None = None
console_messages: dict[str, list[dict]] = field(default_factory=dict)
page_meta: dict[str, TabMeta] = field(default_factory=dict)
ref_maps: dict[str, dict] = field(default_factory=dict) # target_id → RefMap
_playwright: Any = None
_lock: asyncio.Lock = field(default_factory=asyncio.Lock)
@@ -447,6 +448,7 @@ class BrowserSession:
self.active_page_id = None
self.console_messages.clear()
self.page_meta.clear()
self.ref_maps.clear()
async def start(self, headless: bool = True, persistent: bool = True) -> dict:
"""
@@ -623,6 +625,7 @@ class BrowserSession:
self.active_page_id = None
self.console_messages.clear()
self.page_meta.clear()
self.ref_maps.clear()
self.user_data_dir = None
self.persistent = False
@@ -801,6 +804,7 @@ class BrowserSession:
self.pages.pop(target_id, None)
self.console_messages.pop(target_id, None)
self.page_meta.pop(target_id, None)
self.ref_maps.pop(target_id, None)
if self.active_page_id == target_id:
self.active_page_id = next(iter(self.pages), None)
+20
View File
@@ -16,6 +16,7 @@ from playwright.async_api import (
)
from ..highlight import highlight_element
from ..refs import resolve_selector
from ..session import DEFAULT_TIMEOUT_MS, get_session
@@ -52,6 +53,10 @@ def register_advanced_tools(mcp: FastMCP) -> None:
return {"ok": False, "error": "No active tab"}
if selector:
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await page.wait_for_selector(selector, timeout=timeout_ms)
return {"ok": True, "action": "wait", "condition": "selector", "selector": selector}
elif text:
@@ -122,6 +127,11 @@ def register_advanced_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
element = await page.wait_for_selector(selector, timeout=timeout_ms)
if not element:
return {"ok": False, "error": f"Element not found: {selector}"}
@@ -160,6 +170,11 @@ def register_advanced_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
element = await page.wait_for_selector(selector, timeout=timeout_ms)
if not element:
return {"ok": False, "error": f"Element not found: {selector}"}
@@ -238,6 +253,11 @@ def register_advanced_tools(mcp: FastMCP) -> None:
if not Path(path).exists():
return {"ok": False, "error": f"File not found: {path}"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await highlight_element(page, selector)
element = await page.wait_for_selector(selector, timeout=timeout_ms)
+147 -9
View File
@@ -7,14 +7,113 @@ Tools for extracting content and capturing page state.
from __future__ import annotations
import base64
import io
import json
import logging
from pathlib import Path
from typing import Any, Literal
from fastmcp import FastMCP
from mcp.types import ImageContent, TextContent
from playwright.async_api import Error as PlaywrightError
from ..session import get_session
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Screenshot normalization
# ---------------------------------------------------------------------------
# JPEG qualities attempted at each candidate size, best quality first.
_QUALITY_STEPS = (85, 70, 50)
# Smallest side length (px) the downsizing loop will try.
_MIN_DIMENSION = 400
# Decrement (px) between successive candidate side lengths.
_DIMENSION_STEP = 200
def _normalize_screenshot(
    raw_bytes: bytes,
    image_type: str,
    *,
    max_dimension: int = 2000,
    max_bytes: int = 5_000_000,
) -> tuple[bytes, str]:
    """Normalize a screenshot to fit within size and dimension limits.

    Progressively resizes and compresses to JPEG until the image fits
    under *max_bytes* and *max_dimension*. If Pillow is not installed
    the original bytes are returned unchanged.

    Args:
        raw_bytes: Raw PNG or JPEG image bytes from Playwright.
        image_type: Original format (``"png"`` or ``"jpeg"``).
        max_dimension: Maximum width or height in pixels.
        max_bytes: Maximum file size in bytes.

    Returns:
        ``(normalized_bytes, image_type)`` where *image_type* may change
        to ``"jpeg"`` if compression was applied.
    """
    try:
        # Pillow is an optional dependency (pyproject "browser" extra);
        # without it, normalization is silently skipped.
        from PIL import Image
    except ImportError:
        logger.debug("Pillow not installed — skipping screenshot normalization")
        return raw_bytes, image_type
    try:
        img = Image.open(io.BytesIO(raw_bytes))
        width, height = img.size
        max_dim = max(width, height)
        # Already within limits — return as-is
        if len(raw_bytes) <= max_bytes and max_dim <= max_dimension:
            return raw_bytes, image_type
        # Build candidate dimensions (descending), skip anything >= original
        candidates = [
            d for d in range(max_dimension, _MIN_DIMENSION - 1, -_DIMENSION_STEP) if d < max_dim
        ]
        # If the original is already <= max_dimension but over max_bytes,
        # still try compressing at original size first.
        if max_dim <= max_dimension:
            candidates = [max_dim] + candidates
        # Track the smallest output produced so far as (bytes, byte_count),
        # so we can fall back to it when nothing fits under max_bytes.
        smallest: tuple[bytes, int] | None = None
        for side in candidates:
            # Re-open from source each iteration (thumbnail is destructive)
            img = Image.open(io.BytesIO(raw_bytes))
            img.thumbnail((side, side), Image.LANCZOS)
            # JPEG doesn't support alpha
            if img.mode in ("RGBA", "LA", "P"):
                img = img.convert("RGB")
            for quality in _QUALITY_STEPS:
                buf = io.BytesIO()
                img.save(buf, format="JPEG", quality=quality, optimize=True)
                out_bytes = buf.getvalue()
                if smallest is None or len(out_bytes) < smallest[1]:
                    smallest = (out_bytes, len(out_bytes))
                # First candidate that fits wins — candidates are ordered
                # largest-first, so this is the highest-fidelity fit.
                if len(out_bytes) <= max_bytes:
                    return out_bytes, "jpeg"
        # Nothing fit — return the smallest we produced
        if smallest is not None:
            logger.warning(
                "Screenshot normalization: could not fit under %d bytes (best: %d bytes)",
                max_bytes,
                smallest[1],
            )
            return smallest[0], "jpeg"
        return raw_bytes, image_type
    except Exception:
        # Deliberately broad: normalization is best-effort; a decode or
        # re-encode failure must never drop the screenshot itself.
        logger.warning("Screenshot normalization failed — returning original", exc_info=True)
        return raw_bytes, image_type
def _format_ax_tree(nodes: list[dict[str, Any]]) -> str:
"""Format a CDP Accessibility.getFullAXTree result into an indented text tree.
@@ -102,10 +201,13 @@ def register_inspection_tools(mcp: FastMCP) -> None:
full_page: bool = False,
selector: str | None = None,
image_type: Literal["png", "jpeg"] = "png",
) -> dict:
) -> list:
"""
Take a screenshot of the current page.
Returns the screenshot as an image the LLM can see, alongside
text metadata (URL, size, etc.).
Args:
target_id: Tab ID (default: active tab)
profile: Browser profile name (default: "default")
@@ -114,18 +216,32 @@ def register_inspection_tools(mcp: FastMCP) -> None:
image_type: Image format - png or jpeg (default: png)
Returns:
Dict with screenshot data (base64 encoded) and metadata
List of content blocks: text metadata + image
"""
try:
session = get_session(profile)
page = session.get_page(target_id)
if not page:
return {"ok": False, "error": "No active tab"}
return [
TextContent(
type="text", text=json.dumps({"ok": False, "error": "No active tab"})
)
]
if selector:
from ..refs import resolve_selector
selector = resolve_selector(selector, session, target_id)
element = await page.query_selector(selector)
if not element:
return {"ok": False, "error": f"Element not found: {selector}"}
return [
TextContent(
type="text",
text=json.dumps(
{"ok": False, "error": f"Element not found: {selector}"}
),
)
]
screenshot_bytes = await element.screenshot(type=image_type)
else:
screenshot_bytes = await page.screenshot(
@@ -133,16 +249,31 @@ def register_inspection_tools(mcp: FastMCP) -> None:
type=image_type,
)
return {
normalized_bytes, normalized_type = _normalize_screenshot(screenshot_bytes, image_type)
meta = json.dumps(
{
"ok": True,
"targetId": target_id or session.active_page_id,
"url": page.url,
"imageType": image_type,
"imageBase64": base64.b64encode(screenshot_bytes).decode(),
"size": len(screenshot_bytes),
"imageType": normalized_type,
"size": len(normalized_bytes),
"originalSize": len(screenshot_bytes),
}
)
return [
TextContent(type="text", text=meta),
ImageContent(
type="image",
data=base64.b64encode(normalized_bytes).decode(),
mimeType=f"image/{normalized_type}",
),
]
except PlaywrightError as e:
return {"ok": False, "error": f"Browser error: {e!s}"}
return [
TextContent(
type="text", text=json.dumps({"ok": False, "error": f"Browser error: {e!s}"})
)
]
@mcp.tool()
async def browser_snapshot(
@@ -196,6 +327,13 @@ def register_inspection_tools(mcp: FastMCP) -> None:
await cdp.detach()
else:
snapshot = await page.locator(":root").aria_snapshot()
# Annotate with [ref=eN] markers for interactive elements
from ..refs import annotate_snapshot
snapshot, ref_map = annotate_snapshot(snapshot)
tid = target_id or session.active_page_id
if tid:
session.ref_maps[tid] = ref_map
return {
"ok": True,
+70 -8
View File
@@ -17,7 +17,8 @@ from playwright.async_api import (
)
from ..highlight import highlight_coordinate, highlight_element
from ..session import DEFAULT_TIMEOUT_MS, get_session
from ..refs import annotate_snapshot, resolve_selector
from ..session import DEFAULT_TIMEOUT_MS, BrowserSession, get_session
logger = logging.getLogger(__name__)
@@ -27,6 +28,8 @@ _AUTO_SNAPSHOT_MAX_CHARS = 4000
async def _auto_snapshot(
page: Page,
*,
session: BrowserSession | None = None,
target_id: str | None = None,
wait_for_nav: bool = False,
max_chars: int = _AUTO_SNAPSHOT_MAX_CHARS,
) -> str | None:
@@ -34,6 +37,8 @@ async def _auto_snapshot(
Args:
page: Playwright Page instance.
session: BrowserSession to store ref maps in.
target_id: Target page id for ref map storage.
wait_for_nav: If True, briefly wait for any in-flight navigation to
settle before snapshotting. Used after click actions that may
trigger page navigation.
@@ -48,6 +53,14 @@ async def _auto_snapshot(
except Exception:
pass # No navigation happened — that's fine
snapshot = await page.locator(":root").aria_snapshot()
# Annotate with refs before truncation so the full RefMap is captured
if snapshot and session:
snapshot, ref_map = annotate_snapshot(snapshot)
tid = target_id or session.active_page_id
if tid:
session.ref_maps[tid] = ref_map
if snapshot and max_chars > 0 and len(snapshot) > max_chars:
snapshot = (
snapshot[:max_chars]
@@ -96,6 +109,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await highlight_element(page, selector)
if double_click:
@@ -105,7 +123,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
result: dict = {"ok": True, "action": "click", "selector": selector}
if auto_snapshot:
snapshot = await _auto_snapshot(page, wait_for_nav=True)
snapshot = await _auto_snapshot(
page,
session=session,
target_id=target_id,
wait_for_nav=True,
)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
@@ -151,7 +174,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
await page.mouse.click(x, y, button=button)
result: dict = {"ok": True, "action": "click_coordinate", "x": x, "y": y}
if auto_snapshot:
snapshot = await _auto_snapshot(page, wait_for_nav=True)
snapshot = await _auto_snapshot(
page,
session=session,
target_id=target_id,
wait_for_nav=True,
)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
@@ -194,6 +222,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await highlight_element(page, selector)
if clear_first:
@@ -202,7 +235,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
await page.type(selector, text, delay=delay_ms, timeout=timeout_ms)
result: dict = {"ok": True, "action": "type", "selector": selector, "length": len(text)}
if auto_snapshot:
snapshot = await _auto_snapshot(page)
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
@@ -244,12 +277,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await highlight_element(page, selector)
await page.fill(selector, value, timeout=timeout_ms)
result: dict = {"ok": True, "action": "fill", "selector": selector}
if auto_snapshot:
snapshot = await _auto_snapshot(page)
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
@@ -287,6 +325,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
return {"ok": False, "error": "No active tab"}
if selector:
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await page.press(selector, key, timeout=timeout_ms)
else:
await page.keyboard.press(key)
@@ -322,6 +364,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await page.hover(selector, timeout=timeout_ms)
return {"ok": True, "action": "hover", "selector": selector}
except PlaywrightTimeout:
@@ -360,6 +407,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
selected = await page.select_option(selector, values, timeout=timeout_ms)
result: dict = {
"ok": True,
@@ -368,7 +420,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
"selected": selected,
}
if auto_snapshot:
snapshot = await _auto_snapshot(page)
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
@@ -422,6 +474,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
delta_x = -amount
if selector:
try:
selector = resolve_selector(selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
element = await page.query_selector(selector)
if element:
await element.evaluate(f"e => e.scrollBy({delta_x}, {delta_y})")
@@ -435,7 +491,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
"amount": amount,
}
if auto_snapshot:
snapshot = await _auto_snapshot(page)
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
@@ -474,6 +530,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
if not page:
return {"ok": False, "error": "No active tab"}
try:
start_selector = resolve_selector(start_selector, session, target_id)
end_selector = resolve_selector(end_selector, session, target_id)
except ValueError as e:
return {"ok": False, "error": str(e)}
await page.drag_and_drop(
start_selector,
end_selector,
@@ -486,7 +548,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
"to": end_selector,
}
if auto_snapshot:
snapshot = await _auto_snapshot(page)
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
if snapshot:
result["snapshot"] = snapshot
result["url"] = page.url
+187
View File
@@ -0,0 +1,187 @@
"""Tests for the browser ref system (annotate_snapshot / resolve_ref)."""
from __future__ import annotations
import pytest
from gcu.browser.refs import (
RefEntry,
annotate_snapshot,
resolve_ref,
)
# ---------------------------------------------------------------------------
# annotate_snapshot
# ---------------------------------------------------------------------------
# Representative aria snapshot covering the three role categories the
# annotator distinguishes: interactive roles (link, textbox, button),
# named content roles (heading, img), and structural roles
# (navigation, main, list, listitem, paragraph).
SAMPLE_SNAPSHOT = """\
- navigation "Main":
- link "Home"
- link "About"
- main:
- heading "Welcome"
- textbox "Search"
- button "Submit"
- paragraph: some text here
- img "Logo"
- list:
- listitem:
- link "Item 1"
- listitem:
- link "Item 2\""""
class TestAnnotateSnapshot:
    """Ref-assignment rules applied by ``annotate_snapshot``."""

    def test_assigns_refs_to_interactive_roles(self):
        annotated, ref_map = annotate_snapshot(SAMPLE_SNAPSHOT)
        # Ref markers must appear in the annotated text...
        assert "[ref=e" in annotated
        # ...and each interactive role must have landed in the map.
        seen = {entry.role for entry in ref_map.values()}
        for role in ("link", "textbox", "button"):
            assert role in seen

    def test_skips_structural_roles(self):
        _, ref_map = annotate_snapshot(SAMPLE_SNAPSHOT)
        seen = {entry.role for entry in ref_map.values()}
        # Structural containers never receive refs.
        structural = {"navigation", "main", "list", "listitem", "paragraph"}
        assert not (structural & seen)

    def test_named_content_roles_get_refs(self):
        _, ref_map = annotate_snapshot(SAMPLE_SNAPSHOT)
        seen = {entry.role for entry in ref_map.values()}
        # heading and img carry names in the sample, so both get refs.
        assert {"heading", "img"} <= seen

    def test_unnamed_content_roles_skip(self):
        # Content roles without an accessible name are not annotated.
        _, ref_map = annotate_snapshot("- heading\n- img")
        assert not ref_map

    def test_preserves_non_matching_lines(self):
        source = 'some random text\n- button "OK"\nanother line'
        annotated, _ = annotate_snapshot(source)
        out = annotated.split("\n")
        assert (out[0], out[2]) == ("some random text", "another line")

    def test_nth_disambiguation(self):
        source = '- button "Save"\n- button "Save"\n- button "Cancel"'
        _, ref_map = annotate_snapshot(source)
        saves = [e for e in ref_map.values() if (e.role, e.name) == ("button", "Save")]
        cancels = [e for e in ref_map.values() if (e.role, e.name) == ("button", "Cancel")]
        # Duplicate "Save" buttons are disambiguated by nth; lone "Cancel" is nth=0.
        assert sorted(e.nth for e in saves) == [0, 1]
        assert [e.nth for e in cancels] == [0]

    def test_sequential_ref_ids(self):
        _, ref_map = annotate_snapshot('- link "A"\n- link "B"\n- link "C"')
        assert sorted(ref_map) == ["e0", "e1", "e2"]

    def test_empty_snapshot(self):
        annotated, ref_map = annotate_snapshot("")
        assert (annotated, ref_map) == ("", {})
# ---------------------------------------------------------------------------
# resolve_ref
# ---------------------------------------------------------------------------
class TestResolveRef:
    """``resolve_ref``: ref ids become Playwright role selectors; anything
    that is not a ref id passes through untouched."""

    def test_resolves_valid_ref(self):
        refs = {"e0": RefEntry(role="button", name="Submit", nth=0)}
        assert resolve_ref("e0", refs) == 'role=button[name="Submit"] >> nth=0'

    def test_passes_through_css_selectors(self):
        refs = {"e0": RefEntry(role="button", name="OK", nth=0)}
        # Plain CSS selectors are returned verbatim.
        for css in ("#my-button", ".btn-primary", "div > button"):
            assert resolve_ref(css, refs) == css

    def test_passes_through_role_selectors(self):
        refs = {"e0": RefEntry(role="button", name="OK", nth=0)}
        selector = 'role=button[name="OK"]'
        assert resolve_ref(selector, refs) == selector

    def test_raises_on_unknown_ref(self):
        refs = {"e0": RefEntry(role="button", name="OK", nth=0)}
        with pytest.raises(ValueError, match="not found"):
            resolve_ref("e99", refs)

    def test_raises_when_no_ref_map(self):
        # A ref id without any snapshot-derived map is an error.
        with pytest.raises(ValueError, match="no snapshot"):
            resolve_ref("e0", None)

    def test_escapes_quotes_in_name(self):
        refs = {"e0": RefEntry(role="button", name='Say "Hello"', nth=0)}
        expected = 'role=button[name="Say \\"Hello\\""] >> nth=0'
        assert resolve_ref("e0", refs) == expected

    def test_no_name_produces_role_only_selector(self):
        refs = {"e0": RefEntry(role="textbox", name=None, nth=0)}
        assert resolve_ref("e0", refs) == "role=textbox >> nth=0"

    def test_empty_name(self):
        # Empty string is a real (empty) name — distinct from None.
        refs = {"e0": RefEntry(role="button", name="", nth=0)}
        assert resolve_ref("e0", refs) == 'role=button[name=""] >> nth=0'

    def test_nth_in_selector(self):
        refs = {"e0": RefEntry(role="link", name="Next", nth=2)}
        assert resolve_ref("e0", refs) == 'role=link[name="Next"] >> nth=2'
# ---------------------------------------------------------------------------
# Round-trip: annotate → resolve
# ---------------------------------------------------------------------------
class TestRoundTrip:
    """annotate_snapshot output feeds resolve_ref without loss."""

    def test_annotate_then_resolve(self):
        source = '- button "Submit"\n- textbox "Email"\n- link "Home"'
        _, ref_map = annotate_snapshot(source)
        # Every assigned ref must resolve to a role selector that encodes
        # the entry's role, name (when present), and nth index.
        for ref_id, entry in ref_map.items():
            selector = resolve_ref(ref_id, ref_map)
            assert selector.startswith(f"role={entry.role}")
            if entry.name is not None:
                assert f'name="{entry.name}"' in selector
            assert f"nth={entry.nth}" in selector

    def test_css_selectors_still_work_after_annotate(self):
        _, ref_map = annotate_snapshot('- button "OK"')
        # A populated ref_map must not hijack ordinary CSS selectors.
        assert resolve_ref("#submit-btn", ref_map) == "#submit-btn"
@@ -0,0 +1,159 @@
"""Tests for screenshot normalization.
Requires the ``browser`` extra (Pillow). Skipped automatically when
Pillow is not installed.
"""
from __future__ import annotations
import io
from unittest.mock import patch
import pytest
Image = pytest.importorskip(
"PIL.Image", reason="Pillow not installed (install with: pip install pillow)"
)
from gcu.browser.tools.inspection import _normalize_screenshot # noqa: E402
def _make_png(width: int, height: int, *, mode: str = "RGB") -> bytes:
    """Return PNG bytes for a solid-color *width* x *height* image.

    *mode* selects the pixel layout; non-RGB modes get a 4-component fill
    so RGBA images carry an alpha channel.
    """
    fill = (100, 150, 200) if mode == "RGB" else (100, 150, 200, 128)
    canvas = Image.new(mode, (width, height), color=fill)
    out = io.BytesIO()
    canvas.save(out, format="PNG")
    return out.getvalue()
def _make_large_png(width: int, height: int, min_bytes: int) -> bytes:
    """Create a poorly-compressible PNG, aiming for at least *min_bytes*.

    The gradient-with-noise fill defeats PNG compression, so the output
    is typically large.  *min_bytes* is a best-effort target only: the
    function never checks it, and a smaller result is returned as-is —
    callers mainly need a large-dimension image.
    """
    # Gradient with noise produces poorly-compressible PNGs
    img = Image.new("RGB", (width, height))
    pixels = img.load()
    for y in range(height):
        for x in range(width):
            pixels[x, y] = ((x * 7 + y * 13) % 256, (x * 11 + y * 3) % 256, (x * 5 + y * 17) % 256)
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    raw = buf.getvalue()
    # If still under target, that's fine for most tests — the important
    # thing is we have a large-dimension image.
    return raw
class TestPassthrough:
    """Images already within limits should pass through unchanged."""

    def test_small_image_unchanged(self):
        original = _make_png(100, 100)
        out, fmt = _normalize_screenshot(original, "png")
        # Identity check: the very same bytes object must come back.
        assert out is original
        assert fmt == "png"

    def test_within_dimension_and_size_unchanged(self):
        original = _make_png(1920, 1080)
        out, fmt = _normalize_screenshot(original, "png")
        assert out is original
        assert fmt == "png"
class TestDimensionResize:
    """Images exceeding max_dimension should be resized."""

    def test_large_dimension_gets_resized(self):
        out, fmt = _normalize_screenshot(_make_png(4000, 3000), "png")
        # Normalization re-encodes as JPEG and caps the longest side.
        assert fmt == "jpeg"
        resized = Image.open(io.BytesIO(out))
        assert max(resized.size) <= 2000

    def test_custom_max_dimension(self):
        out, fmt = _normalize_screenshot(_make_png(2000, 1500), "png", max_dimension=800)
        assert fmt == "jpeg"
        resized = Image.open(io.BytesIO(out))
        assert max(resized.size) <= 800

    def test_aspect_ratio_preserved(self):
        out, _ = _normalize_screenshot(_make_png(4000, 2000), "png")  # 2:1 source
        w, h = Image.open(io.BytesIO(out)).size
        assert abs(w / h - 2.0) < 0.1  # allow small rounding error
class TestSizeCompression:
    """Images exceeding max_bytes should be compressed."""

    def test_custom_max_bytes(self):
        source = _make_large_png(1500, 1500, min_bytes=100_000)
        compressed, kind = _normalize_screenshot(source, "png", max_bytes=50_000)
        assert kind == "jpeg"
        assert len(compressed) <= 50_000

    def test_over_size_within_dimension_compresses(self):
        """Image within dimension limit but over byte limit gets JPEG-compressed."""
        source = _make_large_png(1800, 1800, min_bytes=100_000)
        compressed, kind = _normalize_screenshot(source, "png", max_bytes=50_000)
        assert kind == "jpeg"
        assert len(compressed) <= 50_000
class TestAlphaChannel:
    """RGBA images should be converted to RGB for JPEG output."""

    def test_rgba_to_rgb(self):
        source = _make_png(4000, 3000, mode="RGBA")
        converted, kind = _normalize_screenshot(source, "png")
        assert kind == "jpeg"
        # JPEG has no alpha channel, so the decoded image must be plain RGB.
        assert Image.open(io.BytesIO(converted)).mode == "RGB"
class TestGracefulDegradation:
    """Normalization should never break screenshots."""

    def test_pillow_not_available(self):
        """Without Pillow, the original bytes come back untouched.

        Stubbing ``sys.modules["PIL"]`` (and the submodule) with ``None``
        makes ``from PIL import Image`` inside ``_normalize_screenshot``
        raise ``ModuleNotFoundError`` — documented import-system behavior
        for a ``None`` entry in ``sys.modules`` — which is a subclass of
        the ``ImportError`` the function catches.  This replaces the
        previous ``__builtins__.__import__`` patching, which was both
        redundant and fragile: ``__builtins__`` is a module only in
        ``__main__`` and a plain dict in imported modules (a CPython
        implementation detail).
        """
        raw = _make_png(4000, 3000)
        with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
            result_bytes, result_type = _normalize_screenshot(raw, "png")
        # Should return original unchanged
        assert result_bytes is raw
        assert result_type == "png"

    def test_corrupt_bytes_returns_original(self):
        """Undecodable input is returned verbatim, never raised on."""
        raw = b"not an image at all"
        result_bytes, result_type = _normalize_screenshot(raw, "png")
        assert result_bytes is raw
        assert result_type == "png"

    def test_empty_bytes_returns_original(self):
        """Empty input is returned verbatim, never raised on."""
        raw = b""
        result_bytes, result_type = _normalize_screenshot(raw, "png")
        assert result_bytes is raw
        assert result_type == "png"
Generated
+5 -1
View File
@@ -3523,6 +3523,9 @@ all = [
bigquery = [
{ name = "google-cloud-bigquery" },
]
browser = [
{ name = "pillow" },
]
databricks = [
{ name = "databricks-mcp" },
{ name = "databricks-sdk" },
@@ -3577,6 +3580,7 @@ requires-dist = [
{ name = "openpyxl", marker = "extra == 'excel'", specifier = ">=3.1.0" },
{ name = "pandas", specifier = ">=2.0.0" },
{ name = "pillow", marker = "extra == 'all'", specifier = ">=10.0.0" },
{ name = "pillow", marker = "extra == 'browser'", specifier = ">=10.0.0" },
{ name = "pillow", marker = "extra == 'ocr'", specifier = ">=10.0.0" },
{ name = "playwright", specifier = ">=1.40.0" },
{ name = "playwright-stealth", specifier = ">=1.0.5" },
@@ -3594,7 +3598,7 @@ requires-dist = [
{ name = "restrictedpython", marker = "extra == 'sandbox'", specifier = ">=7.0" },
{ name = "stripe", specifier = ">=14.3.0" },
]
provides-extras = ["dev", "sandbox", "ocr", "excel", "sql", "bigquery", "databricks", "all"]
provides-extras = ["dev", "sandbox", "browser", "ocr", "excel", "sql", "bigquery", "databricks", "all"]
[package.metadata.requires-dev]
dev = [