Merge pull request #6682 from aden-hive/feat/image-capabilities
Release / Create Release (push) Waiting to run
Release / Create Release (push) Waiting to run
feat: image capabilities — upload, screenshot passthrough, vision detection & fallback, aria refs
This commit is contained in:
@@ -702,6 +702,15 @@ stop_worker() to return to STAGING phase.
|
||||
_queen_behavior_always = """
|
||||
# Behavior
|
||||
|
||||
## Images attached by the user
|
||||
|
||||
Users can attach images directly to their chat messages. When you see an \
|
||||
image in the conversation, analyze it using your native vision capability — \
|
||||
do NOT say you cannot see images or that you lack access to files. The image \
|
||||
is embedded in the message; no tool call is needed to view it. Describe what \
|
||||
you see, answer questions about it, and use the visual content to inform your \
|
||||
response just as you would text.
|
||||
|
||||
## CRITICAL RULE — ask_user / ask_user_multiple
|
||||
|
||||
Every response that ends with a question, a prompt, or expects user \
|
||||
|
||||
@@ -150,7 +150,7 @@ Call all three subagents in a single response to run them in parallel:
|
||||
|
||||
## GCU Anti-Patterns
|
||||
|
||||
- Using `browser_screenshot` to read text (use `browser_snapshot`)
|
||||
- Using `browser_screenshot` to read text (use `browser_snapshot` instead; screenshots are for visual context only)
|
||||
- Re-navigating after scrolling (resets scroll position)
|
||||
- Attempting login on auth walls
|
||||
- Forgetting `target_id` in multi-tab scenarios
|
||||
|
||||
@@ -33,12 +33,20 @@ class Message:
|
||||
is_transition_marker: bool = False
|
||||
# True when this message is real human input (from /chat), not a system prompt
|
||||
is_client_input: bool = False
|
||||
# Optional image content blocks (e.g. from browser_screenshot)
|
||||
image_content: list[dict[str, Any]] | None = None
|
||||
# True when message contains an activated skill body (AS-10: never prune)
|
||||
is_skill_content: bool = False
|
||||
|
||||
def to_llm_dict(self) -> dict[str, Any]:
|
||||
"""Convert to OpenAI-format message dict."""
|
||||
if self.role == "user":
|
||||
if self.image_content:
|
||||
blocks: list[dict[str, Any]] = []
|
||||
if self.content:
|
||||
blocks.append({"type": "text", "text": self.content})
|
||||
blocks.extend(self.image_content)
|
||||
return {"role": "user", "content": blocks}
|
||||
return {"role": "user", "content": self.content}
|
||||
|
||||
if self.role == "assistant":
|
||||
@@ -49,6 +57,15 @@ class Message:
|
||||
|
||||
# role == "tool"
|
||||
content = f"ERROR: {self.content}" if self.is_error else self.content
|
||||
if self.image_content:
|
||||
# Multimodal tool result: text + image content blocks
|
||||
blocks: list[dict[str, Any]] = [{"type": "text", "text": content}]
|
||||
blocks.extend(self.image_content)
|
||||
return {
|
||||
"role": "tool",
|
||||
"tool_call_id": self.tool_use_id,
|
||||
"content": blocks,
|
||||
}
|
||||
return {
|
||||
"role": "tool",
|
||||
"tool_call_id": self.tool_use_id,
|
||||
@@ -74,6 +91,8 @@ class Message:
|
||||
d["is_transition_marker"] = self.is_transition_marker
|
||||
if self.is_client_input:
|
||||
d["is_client_input"] = self.is_client_input
|
||||
if self.image_content is not None:
|
||||
d["image_content"] = self.image_content
|
||||
return d
|
||||
|
||||
@classmethod
|
||||
@@ -89,6 +108,7 @@ class Message:
|
||||
phase_id=data.get("phase_id"),
|
||||
is_transition_marker=data.get("is_transition_marker", False),
|
||||
is_client_input=data.get("is_client_input", False),
|
||||
image_content=data.get("image_content"),
|
||||
)
|
||||
|
||||
|
||||
@@ -375,6 +395,7 @@ class NodeConversation:
|
||||
*,
|
||||
is_transition_marker: bool = False,
|
||||
is_client_input: bool = False,
|
||||
image_content: list[dict[str, Any]] | None = None,
|
||||
) -> Message:
|
||||
msg = Message(
|
||||
seq=self._next_seq,
|
||||
@@ -383,6 +404,7 @@ class NodeConversation:
|
||||
phase_id=self._current_phase,
|
||||
is_transition_marker=is_transition_marker,
|
||||
is_client_input=is_client_input,
|
||||
image_content=image_content,
|
||||
)
|
||||
self._messages.append(msg)
|
||||
self._next_seq += 1
|
||||
@@ -411,6 +433,7 @@ class NodeConversation:
|
||||
tool_use_id: str,
|
||||
content: str,
|
||||
is_error: bool = False,
|
||||
image_content: list[dict[str, Any]] | None = None,
|
||||
is_skill_content: bool = False,
|
||||
) -> Message:
|
||||
msg = Message(
|
||||
@@ -420,6 +443,7 @@ class NodeConversation:
|
||||
tool_use_id=tool_use_id,
|
||||
is_error=is_error,
|
||||
phase_id=self._current_phase,
|
||||
image_content=image_content,
|
||||
is_skill_content=is_skill_content,
|
||||
)
|
||||
self._messages.append(msg)
|
||||
|
||||
@@ -14,6 +14,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from collections.abc import Awaitable, Callable
|
||||
@@ -24,6 +25,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
|
||||
|
||||
from framework.graph.conversation import ConversationStore, NodeConversation
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -37,6 +39,56 @@ from framework.runtime.llm_debug_logger import log_llm_turn
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
    """Describe images using the best available vision model.

    Called when the queen's model lacks vision support. Tries vision-capable
    models in priority order based on available API keys and returns a bracketed
    description to inject into the message text, or None if no vision model is
    reachable.

    Args:
        image_content: OpenAI-format ``image_url`` content blocks to describe.

    Returns:
        A ``"[<label> attached — description: ...]"`` string suitable for
        inlining into a text-only message, or None when no vision model
        produced a description.
    """
    # Ordered candidates based on available env vars. Built FIRST so that when
    # no key is configured we can bail out immediately — this also skips the
    # heavy `litellm` import in a path that could only ever return None.
    candidates: list[str] = []
    if os.environ.get("OPENAI_API_KEY"):
        candidates.append("gpt-4o-mini")
    if os.environ.get("ANTHROPIC_API_KEY"):
        candidates.append("claude-3-haiku-20240307")
    if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
        candidates.append("gemini/gemini-1.5-flash")
    if not candidates:
        return None

    import litellm  # deferred: only needed once we know a provider key exists

    # Build content blocks: instruction prompt followed by all images.
    blocks: list[dict[str, Any]] = [
        {
            "type": "text",
            "text": (
                "Describe the following image(s) concisely but with enough detail "
                "that a text-only AI assistant can understand the content and context."
            ),
        }
    ]
    blocks.extend(image_content)

    for model in candidates:
        try:
            response = await litellm.acompletion(
                model=model,
                messages=[{"role": "user", "content": blocks}],
                max_tokens=512,
            )
            description = (response.choices[0].message.content or "").strip()
            if description:
                count = len(image_content)
                label = "image" if count == 1 else f"{count} images"
                return f"[{label} attached — description: {description}]"
        except Exception as exc:  # best-effort fallback: try the next candidate
            logger.debug("Vision fallback model '%s' failed: %s", model, exc)
            continue

    return None
|
||||
|
||||
|
||||
@dataclass
|
||||
class TriggerEvent:
|
||||
"""A framework-level trigger signal (timer tick or webhook hit).
|
||||
@@ -90,7 +142,13 @@ class _EscalationReceiver:
|
||||
self._response: str | None = None
|
||||
self._awaiting_input = True # So inject_worker_message() can prefer us
|
||||
|
||||
async def inject_event(self, content: str, *, is_client_input: bool = False) -> None:
|
||||
async def inject_event(
|
||||
self,
|
||||
content: str,
|
||||
*,
|
||||
is_client_input: bool = False,
|
||||
image_content: list[dict] | None = None,
|
||||
) -> None:
|
||||
"""Called by ExecutionStream.inject_input() when the user responds."""
|
||||
self._response = content
|
||||
self._event.set()
|
||||
@@ -426,7 +484,9 @@ class EventLoopNode(NodeProtocol):
|
||||
self._config = config or LoopConfig()
|
||||
self._tool_executor = tool_executor
|
||||
self._conversation_store = conversation_store
|
||||
self._injection_queue: asyncio.Queue[tuple[str, bool]] = asyncio.Queue()
|
||||
self._injection_queue: asyncio.Queue[tuple[str, bool, list[dict[str, Any]] | None]] = (
|
||||
asyncio.Queue()
|
||||
)
|
||||
self._trigger_queue: asyncio.Queue[TriggerEvent] = asyncio.Queue()
|
||||
# Client-facing input blocking state
|
||||
self._input_ready = asyncio.Event()
|
||||
@@ -784,7 +844,7 @@ class EventLoopNode(NodeProtocol):
|
||||
)
|
||||
|
||||
# 6b. Drain injection queue
|
||||
await self._drain_injection_queue(conversation)
|
||||
await self._drain_injection_queue(conversation, ctx)
|
||||
# 6b1. Drain trigger queue (framework-level signals)
|
||||
await self._drain_trigger_queue(conversation)
|
||||
|
||||
@@ -1910,7 +1970,13 @@ class EventLoopNode(NodeProtocol):
|
||||
conversation=conversation if _is_continuous else None,
|
||||
)
|
||||
|
||||
async def inject_event(self, content: str, *, is_client_input: bool = False) -> None:
|
||||
async def inject_event(
|
||||
self,
|
||||
content: str,
|
||||
*,
|
||||
is_client_input: bool = False,
|
||||
image_content: list[dict[str, Any]] | None = None,
|
||||
) -> None:
|
||||
"""Inject an external event or user input into the running loop.
|
||||
|
||||
The content becomes a user message prepended to the next iteration.
|
||||
@@ -1926,8 +1992,10 @@ class EventLoopNode(NodeProtocol):
|
||||
human user (e.g. /chat endpoint), False for external events
|
||||
(e.g. worker question forwarded by the frontend). Controls
|
||||
message formatting in _drain_injection_queue, not wake behavior.
|
||||
image_content: Optional list of image content blocks (OpenAI
|
||||
image_url format) to include alongside the text.
|
||||
"""
|
||||
await self._injection_queue.put((content, is_client_input))
|
||||
await self._injection_queue.put((content, is_client_input, image_content))
|
||||
self._input_ready.set()
|
||||
|
||||
async def inject_trigger(self, trigger: TriggerEvent) -> None:
|
||||
@@ -2101,6 +2169,24 @@ class EventLoopNode(NodeProtocol):
|
||||
|
||||
messages = conversation.to_llm_messages()
|
||||
|
||||
# Debug: log whether the last user message contains image blocks
|
||||
for _m in reversed(messages):
|
||||
if _m.get("role") == "user":
|
||||
_content = _m.get("content")
|
||||
if isinstance(_content, list):
|
||||
_img_count = sum(
|
||||
1
|
||||
for _b in _content
|
||||
if isinstance(_b, dict) and _b.get("type") == "image_url"
|
||||
)
|
||||
if _img_count:
|
||||
logger.info(
|
||||
"[%s] LLM call: last user message has %d image block(s)",
|
||||
node_id,
|
||||
_img_count,
|
||||
)
|
||||
break
|
||||
|
||||
# Defensive guard: ensure messages don't end with an assistant
|
||||
# message. The Anthropic API rejects "assistant message prefill"
|
||||
# (conversations must end with a user or tool message). This can
|
||||
@@ -2770,10 +2856,21 @@ class EventLoopNode(NodeProtocol):
|
||||
real_tool_results.append(tool_entry)
|
||||
logged_tool_calls.append(tool_entry)
|
||||
|
||||
# Strip image content for models that can't handle it
|
||||
image_content = result.image_content
|
||||
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
|
||||
logger.info(
|
||||
"Stripping image_content from tool result — model '%s' "
|
||||
"does not support images in tool results",
|
||||
ctx.llm.model,
|
||||
)
|
||||
image_content = None
|
||||
|
||||
await conversation.add_tool_result(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=result.content,
|
||||
is_error=result.is_error,
|
||||
image_content=image_content,
|
||||
is_skill_content=result.is_skill_content,
|
||||
)
|
||||
if (
|
||||
@@ -3914,6 +4011,7 @@ class EventLoopNode(NodeProtocol):
|
||||
tool_use_id=result.tool_use_id,
|
||||
content=truncated,
|
||||
is_error=False,
|
||||
image_content=result.image_content,
|
||||
)
|
||||
|
||||
spill_dir = self._config.spillover_dir
|
||||
@@ -3986,6 +4084,7 @@ class EventLoopNode(NodeProtocol):
|
||||
tool_use_id=result.tool_use_id,
|
||||
content=content,
|
||||
is_error=False,
|
||||
image_content=result.image_content,
|
||||
)
|
||||
|
||||
# No spillover_dir — truncate in-place if needed
|
||||
@@ -4028,6 +4127,7 @@ class EventLoopNode(NodeProtocol):
|
||||
tool_use_id=result.tool_use_id,
|
||||
content=truncated,
|
||||
is_error=False,
|
||||
image_content=result.image_content,
|
||||
)
|
||||
|
||||
return result
|
||||
@@ -4698,20 +4798,37 @@ class EventLoopNode(NodeProtocol):
|
||||
]
|
||||
await self._conversation_store.write_cursor(cursor)
|
||||
|
||||
async def _drain_injection_queue(self, conversation: NodeConversation) -> int:
|
||||
async def _drain_injection_queue(self, conversation: NodeConversation, ctx: NodeContext) -> int:
|
||||
"""Drain all pending injected events as user messages. Returns count."""
|
||||
count = 0
|
||||
while not self._injection_queue.empty():
|
||||
try:
|
||||
content, is_client_input = self._injection_queue.get_nowait()
|
||||
content, is_client_input, image_content = self._injection_queue.get_nowait()
|
||||
logger.info(
|
||||
"[drain] injected message (client_input=%s): %s",
|
||||
"[drain] injected message (client_input=%s, images=%d): %s",
|
||||
is_client_input,
|
||||
len(image_content) if image_content else 0,
|
||||
content[:200] if content else "(empty)",
|
||||
)
|
||||
# For models that don't support images, fall back to a text description
|
||||
if image_content and ctx.llm:
|
||||
if not supports_image_tool_results(ctx.llm.model):
|
||||
logger.info(
|
||||
"Model '%s' does not support images — attempting vision fallback",
|
||||
ctx.llm.model,
|
||||
)
|
||||
description = await _describe_images_as_text(image_content)
|
||||
if description:
|
||||
content = f"{content}\n\n{description}" if content else description
|
||||
logger.info("[drain] image described as text via vision fallback")
|
||||
else:
|
||||
logger.info("[drain] no vision fallback available — images dropped")
|
||||
image_content = None
|
||||
# Real user input is stored as-is; external events get a prefix
|
||||
if is_client_input:
|
||||
await conversation.add_user_message(content, is_client_input=True)
|
||||
await conversation.add_user_message(
|
||||
content, is_client_input=True, image_content=image_content
|
||||
)
|
||||
else:
|
||||
await conversation.add_user_message(f"[External event]: {content}")
|
||||
count += 1
|
||||
|
||||
@@ -43,8 +43,11 @@ Follow these rules for reliable, efficient browser interaction.
|
||||
`browser_snapshot` separately after every action.
|
||||
Only call `browser_snapshot` when you need a fresh view without
|
||||
performing an action, or after setting `auto_snapshot=false`.
|
||||
- Do NOT use `browser_screenshot` for reading text content
|
||||
— it produces huge base64 images with no searchable text.
|
||||
- Do NOT use `browser_screenshot` to read text — use
|
||||
`browser_snapshot` for that (compact, searchable, fast).
|
||||
- DO use `browser_screenshot` when you need visual context:
|
||||
charts, images, canvas elements, layout verification, or when
|
||||
the snapshot doesn't capture what you need.
|
||||
- Only fall back to `browser_get_text` for extracting specific
|
||||
small elements by CSS selector.
|
||||
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
"""Model capability checks for LLM providers.
|
||||
|
||||
Vision support rules are derived from official vendor documentation:
|
||||
- ZAI (z.ai): docs.z.ai/guides/vlm — GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only
|
||||
- MiniMax: platform.minimax.io/docs — minimax-vl-01 is vision; M2.x are text-only
|
||||
- DeepSeek: api-docs.deepseek.com — deepseek-vl2 is vision; chat/reasoner are text-only
|
||||
- Cerebras: inference-docs.cerebras.ai — no vision models at all
|
||||
- Groq: console.groq.com/docs/vision — vision capable; treat as supported by default
|
||||
- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names
|
||||
don't reliably indicate vision support, so users must configure explicitly
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def _model_name(model: str) -> str:
|
||||
"""Return the bare model name after stripping any 'provider/' prefix."""
|
||||
if "/" in model:
|
||||
return model.split("/", 1)[1]
|
||||
return model
|
||||
|
||||
|
||||
# Step 1: explicit vision allow-list — these always support images regardless
|
||||
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
|
||||
# is allowed even though glm-4.6 is denied.
|
||||
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
|
||||
# ZAI/GLM vision models (docs.z.ai/guides/vlm)
|
||||
"glm-4v", # GLM-4V series (legacy)
|
||||
"glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
|
||||
# DeepSeek vision models
|
||||
"deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
|
||||
# MiniMax vision model
|
||||
"minimax-vl", # minimax-vl-01
|
||||
)
|
||||
|
||||
# Step 2: provider-level deny — every model from this provider is text-only.
|
||||
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
|
||||
# Cerebras: inference-docs.cerebras.ai lists only text models
|
||||
"cerebras/",
|
||||
# Local runners: model names don't reliably indicate vision support
|
||||
"ollama/",
|
||||
"ollama_chat/",
|
||||
"lm_studio/",
|
||||
"vllm/",
|
||||
"llamacpp/",
|
||||
)
|
||||
|
||||
# Step 3: per-model deny — text-only models within otherwise mixed providers.
|
||||
# Matched against the bare model name (provider prefix stripped, lower-cased).
|
||||
# The vision allow-list above is checked first, so vision variants of the same
|
||||
# family are already handled before these deny patterns are reached.
|
||||
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
|
||||
# --- ZAI / GLM family ---
|
||||
# text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
|
||||
# vision: glm-4v, glm-4.6v (caught by allow-list above)
|
||||
"glm-5",
|
||||
"glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
|
||||
"glm-4.7",
|
||||
"glm-4.5",
|
||||
"zai-glm",
|
||||
# --- DeepSeek ---
|
||||
# text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
|
||||
# vision: deepseek-vl2 (caught by allow-list above)
|
||||
# Note: LiteLLM's deepseek handler may flatten content lists for some models;
|
||||
# VL models are allowed through and rely on LiteLLM's native VL support.
|
||||
"deepseek-chat",
|
||||
"deepseek-coder",
|
||||
"deepseek-reasoner",
|
||||
# --- MiniMax ---
|
||||
# text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
|
||||
# vision: minimax-vl-01 (caught by allow-list above)
|
||||
"minimax-m2",
|
||||
"minimax-text",
|
||||
"abab",
|
||||
)
|
||||
|
||||
|
||||
def supports_image_tool_results(model: str) -> bool:
|
||||
"""Return whether *model* can receive image content in messages.
|
||||
|
||||
Used to gate both user-message images and tool-result image blocks.
|
||||
|
||||
Logic (checked in order):
|
||||
1. Vision allow-list → True (known vision model, skip all denies)
|
||||
2. Provider deny → False (entire provider is text-only)
|
||||
3. Model deny → False (specific text-only model within a mixed provider)
|
||||
4. Default → True (assume capable; unknown providers and models)
|
||||
"""
|
||||
model_lower = model.lower()
|
||||
bare = _model_name(model_lower)
|
||||
|
||||
# 1. Explicit vision allow — takes priority over all denies
|
||||
if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES):
|
||||
return True
|
||||
|
||||
# 2. Provider-level deny (all models from this provider are text-only)
|
||||
if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES):
|
||||
return False
|
||||
|
||||
# 3. Per-model deny (text-only variants within mixed-capability families)
|
||||
if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
|
||||
return False
|
||||
|
||||
# 5. Default: assume vision capable
|
||||
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
|
||||
return True
|
||||
@@ -45,6 +45,7 @@ class ToolResult:
|
||||
tool_use_id: str
|
||||
content: str
|
||||
is_error: bool = False
|
||||
image_content: list[dict[str, Any]] | None = None
|
||||
is_skill_content: bool = False # AS-10: marks activated skill body, protected from pruning
|
||||
|
||||
|
||||
|
||||
@@ -509,17 +509,30 @@ class MCPClient:
|
||||
error_text = content_item.text
|
||||
raise RuntimeError(f"MCP tool '{tool_name}' failed: {error_text}")
|
||||
|
||||
# Extract content
|
||||
# Extract content — preserve image blocks alongside text
|
||||
if result.content:
|
||||
# MCP returns content as a list of content items
|
||||
if len(result.content) > 0:
|
||||
content_item = result.content[0]
|
||||
# Check if it's a text content item
|
||||
if hasattr(content_item, "text"):
|
||||
return content_item.text
|
||||
elif hasattr(content_item, "data"):
|
||||
return content_item.data
|
||||
return result.content
|
||||
text_parts: list[str] = []
|
||||
image_parts: list[dict[str, Any]] = []
|
||||
for item in result.content:
|
||||
if hasattr(item, "text"):
|
||||
text_parts.append(item.text)
|
||||
elif hasattr(item, "data") and hasattr(item, "mimeType"):
|
||||
# MCP ImageContent — preserve as structured image block
|
||||
image_parts.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:{item.mimeType};base64,{item.data}",
|
||||
},
|
||||
}
|
||||
)
|
||||
elif hasattr(item, "data"):
|
||||
text_parts.append(str(item.data))
|
||||
|
||||
text = "\n".join(text_parts) if text_parts else ""
|
||||
if image_parts:
|
||||
return {"_text": text, "_images": image_parts}
|
||||
return text if text else None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -245,6 +245,13 @@ class ToolRegistry:
|
||||
def _wrap_result(tool_use_id: str, result: Any) -> ToolResult:
|
||||
if isinstance(result, ToolResult):
|
||||
return result
|
||||
# MCP client returns dict with _images when image content is present
|
||||
if isinstance(result, dict) and "_images" in result:
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use_id,
|
||||
content=result.get("_text", ""),
|
||||
image_content=result["_images"],
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use_id,
|
||||
content=json.dumps(result) if not isinstance(result, str) else result,
|
||||
@@ -572,7 +579,9 @@ class ToolRegistry:
|
||||
}
|
||||
merged_inputs = {**clean_inputs, **filtered_context}
|
||||
result = client_ref.call_tool(tool_name, merged_inputs)
|
||||
# MCP tools return content array, extract the result
|
||||
# MCP client already extracts content (returns str
|
||||
# or {"_text": ..., "_images": ...} for image results).
|
||||
# Handle legacy list format from HTTP transport.
|
||||
if isinstance(result, list) and len(result) > 0:
|
||||
if isinstance(result[0], dict) and "text" in result[0]:
|
||||
return result[0]["text"]
|
||||
|
||||
@@ -1474,6 +1474,7 @@ class AgentRuntime:
|
||||
graph_id: str | None = None,
|
||||
*,
|
||||
is_client_input: bool = False,
|
||||
image_content: list[dict[str, Any]] | None = None,
|
||||
) -> bool:
|
||||
"""Inject user input into a running client-facing node.
|
||||
|
||||
@@ -1486,6 +1487,8 @@ class AgentRuntime:
|
||||
graph_id: Optional graph to search first (defaults to active graph)
|
||||
is_client_input: True when the message originates from a real
|
||||
human user (e.g. /chat endpoint), False for external events.
|
||||
image_content: Optional list of image content blocks (OpenAI
|
||||
image_url format) to include alongside the text.
|
||||
|
||||
Returns:
|
||||
True if input was delivered, False if no matching node found
|
||||
@@ -1497,7 +1500,9 @@ class AgentRuntime:
|
||||
target = graph_id or self._active_graph_id
|
||||
if target in self._graphs:
|
||||
for stream in self._graphs[target].streams.values():
|
||||
if await stream.inject_input(node_id, content, is_client_input=is_client_input):
|
||||
if await stream.inject_input(
|
||||
node_id, content, is_client_input=is_client_input, image_content=image_content
|
||||
):
|
||||
return True
|
||||
|
||||
# Then search all other graphs
|
||||
@@ -1505,7 +1510,9 @@ class AgentRuntime:
|
||||
if gid == target:
|
||||
continue
|
||||
for stream in reg.streams.values():
|
||||
if await stream.inject_input(node_id, content, is_client_input=is_client_input):
|
||||
if await stream.inject_input(
|
||||
node_id, content, is_client_input=is_client_input, image_content=image_content
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@@ -433,6 +433,7 @@ class ExecutionStream:
|
||||
content: str,
|
||||
*,
|
||||
is_client_input: bool = False,
|
||||
image_content: list[dict[str, Any]] | None = None,
|
||||
) -> bool:
|
||||
"""Inject user input into a running client-facing EventLoopNode.
|
||||
|
||||
@@ -444,7 +445,9 @@ class ExecutionStream:
|
||||
for executor in self._active_executors.values():
|
||||
node = executor.node_registry.get(node_id)
|
||||
if node is not None and hasattr(node, "inject_event"):
|
||||
await node.inject_event(content, is_client_input=is_client_input)
|
||||
await node.inject_event(
|
||||
content, is_client_input=is_client_input, image_content=image_content
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@@ -108,7 +108,10 @@ async def handle_chat(request: web.Request) -> web.Response:
|
||||
The input box is permanently connected to the queen agent.
|
||||
Worker input is handled separately via /worker-input.
|
||||
|
||||
Body: {"message": "hello"}
|
||||
Body: {"message": "hello", "images": [{"type": "image_url", "image_url": {"url": "data:..."}}]}
|
||||
|
||||
The optional ``images`` field accepts a list of OpenAI-format image_url
|
||||
content blocks. The frontend encodes images as base64 data URIs.
|
||||
"""
|
||||
session, err = resolve_session(request)
|
||||
if err:
|
||||
@@ -116,15 +119,16 @@ async def handle_chat(request: web.Request) -> web.Response:
|
||||
|
||||
body = await request.json()
|
||||
message = body.get("message", "")
|
||||
image_content = body.get("images") or None # list[dict] | None
|
||||
|
||||
if not message:
|
||||
if not message and not image_content:
|
||||
return web.json_response({"error": "message is required"}, status=400)
|
||||
|
||||
queen_executor = session.queen_executor
|
||||
if queen_executor is not None:
|
||||
node = queen_executor.node_registry.get("queen")
|
||||
if node is not None and hasattr(node, "inject_event"):
|
||||
await node.inject_event(message, is_client_input=True)
|
||||
await node.inject_event(message, is_client_input=True, image_content=image_content)
|
||||
# Publish to EventBus so the session event log captures user messages
|
||||
from framework.runtime.event_bus import AgentEvent, EventType
|
||||
|
||||
@@ -134,7 +138,10 @@ async def handle_chat(request: web.Request) -> web.Response:
|
||||
stream_id="queen",
|
||||
node_id="queen",
|
||||
execution_id=session.id,
|
||||
data={"content": message},
|
||||
data={
|
||||
"content": message,
|
||||
"image_count": len(image_content) if image_content else 0,
|
||||
},
|
||||
)
|
||||
)
|
||||
return web.json_response(
|
||||
|
||||
@@ -28,6 +28,8 @@ import contextlib
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
@@ -51,8 +53,11 @@ def _get_manager(request: web.Request) -> SessionManager:
|
||||
|
||||
def _session_to_live_dict(session) -> dict:
|
||||
"""Serialize a live Session to the session-primary JSON shape."""
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
|
||||
info = session.worker_info
|
||||
phase_state = getattr(session, "phase_state", None)
|
||||
queen_model: str = getattr(getattr(session, "runner", None), "model", "") or ""
|
||||
return {
|
||||
"session_id": session.id,
|
||||
"worker_id": session.worker_id,
|
||||
@@ -68,6 +73,7 @@ def _session_to_live_dict(session) -> dict:
|
||||
"queen_phase": phase_state.phase
|
||||
if phase_state
|
||||
else ("staging" if session.worker_runtime else "planning"),
|
||||
"queen_supports_images": supports_image_tool_results(queen_model) if queen_model else True,
|
||||
}
|
||||
|
||||
|
||||
@@ -978,6 +984,29 @@ async def handle_discover(request: web.Request) -> web.Response:
|
||||
return web.json_response(result)
|
||||
|
||||
|
||||
async def handle_reveal_session_folder(request: web.Request) -> web.Response:
    """POST /api/sessions/{session_id}/reveal — open session data folder in the OS file manager."""
    manager: SessionManager = request.app["manager"]
    session_id = request.match_info["session_id"]

    # Resumed sessions keep their data under the original session id, so
    # prefer queen_resume_from when the live session exposes one.
    session = manager.get_session(session_id)
    storage_session_id = (session.queen_resume_from or session.id) if session else session_id
    folder = Path.home() / ".hive" / "queen" / "session" / storage_session_id
    folder.mkdir(parents=True, exist_ok=True)

    # Pick the platform file-manager command: macOS `open`, Windows
    # `explorer`, everything else falls back to `xdg-open`.
    opener = {"darwin": "open", "win32": "explorer"}.get(sys.platform, "xdg-open")
    try:
        subprocess.Popen([opener, str(folder)])
    except Exception as exc:
        return web.json_response({"error": str(exc)}, status=500)

    return web.json_response({"path": str(folder)})
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Route registration
|
||||
# ------------------------------------------------------------------
|
||||
@@ -1002,6 +1031,7 @@ def register_routes(app: web.Application) -> None:
|
||||
app.router.add_delete("/api/sessions/{session_id}/worker", handle_unload_worker)
|
||||
|
||||
# Session info
|
||||
app.router.add_post("/api/sessions/{session_id}/reveal", handle_reveal_session_folder)
|
||||
app.router.add_get("/api/sessions/{session_id}/stats", handle_session_stats)
|
||||
app.router.add_get("/api/sessions/{session_id}/entry-points", handle_session_entry_points)
|
||||
app.router.add_patch(
|
||||
|
||||
@@ -34,8 +34,8 @@ export const executionApi = {
|
||||
graph_id: graphId,
|
||||
}),
|
||||
|
||||
chat: (sessionId: string, message: string) =>
|
||||
api.post<ChatResult>(`/sessions/${sessionId}/chat`, { message }),
|
||||
chat: (sessionId: string, message: string, images?: { type: string; image_url: { url: string } }[]) =>
|
||||
api.post<ChatResult>(`/sessions/${sessionId}/chat`, { message, ...(images?.length ? { images } : {}) }),
|
||||
|
||||
/** Queue context for the queen without triggering an LLM response. */
|
||||
queenContext: (sessionId: string, message: string) =>
|
||||
|
||||
@@ -81,6 +81,10 @@ export const sessionsApi = {
|
||||
eventsHistory: (sessionId: string) =>
|
||||
api.get<{ events: AgentEvent[]; session_id: string }>(`/sessions/${sessionId}/events/history`),
|
||||
|
||||
/** Open the session's data folder in the OS file manager. */
|
||||
revealFolder: (sessionId: string) =>
|
||||
api.post<{ path: string }>(`/sessions/${sessionId}/reveal`),
|
||||
|
||||
/** List all queen sessions on disk — live + cold (post-restart). */
|
||||
history: () =>
|
||||
api.get<{ sessions: Array<{ session_id: string; cold: boolean; live: boolean; has_messages: boolean; created_at: number; agent_name?: string | null; agent_path?: string | null }> }>("/sessions/history"),
|
||||
|
||||
@@ -14,6 +14,8 @@ export interface LiveSession {
|
||||
intro_message?: string;
|
||||
/** Queen operating phase — "planning", "building", "staging", or "running" */
|
||||
queen_phase?: "planning" | "building" | "staging" | "running";
|
||||
/** Whether the queen's LLM supports image content in messages */
|
||||
queen_supports_images?: boolean;
|
||||
/** Present in 409 conflict responses when worker is still loading */
|
||||
loading?: boolean;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,19 @@
|
||||
import { memo, useState, useRef, useEffect, useMemo } from "react";
|
||||
import { Send, Square, Crown, Cpu, Check, Loader2 } from "lucide-react";
|
||||
import {
|
||||
Send,
|
||||
Square,
|
||||
Crown,
|
||||
Cpu,
|
||||
Check,
|
||||
Loader2,
|
||||
Paperclip,
|
||||
X,
|
||||
} from "lucide-react";
|
||||
|
||||
export interface ImageContent {
|
||||
type: "image_url";
|
||||
image_url: { url: string };
|
||||
}
|
||||
|
||||
export interface ContextUsageEntry {
|
||||
usagePct: number;
|
||||
@@ -10,7 +24,9 @@ export interface ContextUsageEntry {
|
||||
import MarkdownContent from "@/components/MarkdownContent";
|
||||
import QuestionWidget from "@/components/QuestionWidget";
|
||||
import MultiQuestionWidget from "@/components/MultiQuestionWidget";
|
||||
import ParallelSubagentBubble, { type SubagentGroup } from "@/components/ParallelSubagentBubble";
|
||||
import ParallelSubagentBubble, {
|
||||
type SubagentGroup,
|
||||
} from "@/components/ParallelSubagentBubble";
|
||||
|
||||
export interface ChatMessage {
|
||||
id: string;
|
||||
@@ -18,7 +34,13 @@ export interface ChatMessage {
|
||||
agentColor: string;
|
||||
content: string;
|
||||
timestamp: string;
|
||||
type?: "system" | "agent" | "user" | "tool_status" | "worker_input_request" | "run_divider";
|
||||
type?:
|
||||
| "system"
|
||||
| "agent"
|
||||
| "user"
|
||||
| "tool_status"
|
||||
| "worker_input_request"
|
||||
| "run_divider";
|
||||
role?: "queen" | "worker";
|
||||
/** Which worker thread this message belongs to (worker agent name) */
|
||||
thread?: string;
|
||||
@@ -26,6 +48,8 @@ export interface ChatMessage {
|
||||
createdAt?: number;
|
||||
/** Queen phase active when this message was created */
|
||||
phase?: "planning" | "building" | "staging" | "running";
|
||||
/** Images attached to a user message */
|
||||
images?: ImageContent[];
|
||||
/** Backend node_id that produced this message — used for subagent grouping */
|
||||
nodeId?: string;
|
||||
/** Backend execution_id for this message */
|
||||
@@ -34,7 +58,7 @@ export interface ChatMessage {
|
||||
|
||||
interface ChatPanelProps {
|
||||
messages: ChatMessage[];
|
||||
onSend: (message: string, thread: string) => void;
|
||||
onSend: (message: string, thread: string, images?: ImageContent[]) => void;
|
||||
isWaiting?: boolean;
|
||||
/** When true a worker is thinking (not yet streaming) */
|
||||
isWorkerWaiting?: boolean;
|
||||
@@ -43,6 +67,8 @@ interface ChatPanelProps {
|
||||
activeThread: string;
|
||||
/** When true, the input is disabled (e.g. during loading) */
|
||||
disabled?: boolean;
|
||||
/** When false, the image attach button is hidden (model lacks vision support) */
|
||||
supportsImages?: boolean;
|
||||
/** Called when user clicks the stop button to cancel the queen's current turn */
|
||||
onCancel?: () => void;
|
||||
/** Pending question from ask_user — replaces textarea when present */
|
||||
@@ -50,7 +76,9 @@ interface ChatPanelProps {
|
||||
/** Options for the pending question */
|
||||
pendingOptions?: string[] | null;
|
||||
/** Multiple questions from ask_user_multiple */
|
||||
pendingQuestions?: { id: string; prompt: string; options?: string[] }[] | null;
|
||||
pendingQuestions?:
|
||||
| { id: string; prompt: string; options?: string[] }[]
|
||||
| null;
|
||||
/** Called when user submits an answer to the pending question */
|
||||
onQuestionSubmit?: (answer: string, isOther: boolean) => void;
|
||||
/** Called when user submits answers to multiple questions */
|
||||
@@ -86,7 +114,8 @@ const TOOL_HEX = [
|
||||
|
||||
function toolHex(name: string): string {
|
||||
let hash = 0;
|
||||
for (let i = 0; i < name.length; i++) hash = (hash * 31 + name.charCodeAt(i)) | 0;
|
||||
for (let i = 0; i < name.length; i++)
|
||||
hash = (hash * 31 + name.charCodeAt(i)) | 0;
|
||||
return TOOL_HEX[Math.abs(hash) % TOOL_HEX.length];
|
||||
}
|
||||
|
||||
@@ -134,12 +163,18 @@ function ToolActivityRow({ content }: { content: string }) {
|
||||
<span
|
||||
key={`run-${p.name}`}
|
||||
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
|
||||
style={{ color: hex, backgroundColor: `${hex}18`, border: `1px solid ${hex}35` }}
|
||||
style={{
|
||||
color: hex,
|
||||
backgroundColor: `${hex}18`,
|
||||
border: `1px solid ${hex}35`,
|
||||
}}
|
||||
>
|
||||
<Loader2 className="w-2.5 h-2.5 animate-spin" />
|
||||
{p.name}
|
||||
{p.count > 1 && (
|
||||
<span className="text-[10px] font-medium opacity-70">×{p.count}</span>
|
||||
<span className="text-[10px] font-medium opacity-70">
|
||||
×{p.count}
|
||||
</span>
|
||||
)}
|
||||
</span>
|
||||
);
|
||||
@@ -150,7 +185,11 @@ function ToolActivityRow({ content }: { content: string }) {
|
||||
<span
|
||||
key={`done-${p.name}`}
|
||||
className="inline-flex items-center gap-1 text-[11px] px-2.5 py-0.5 rounded-full"
|
||||
style={{ color: hex, backgroundColor: `${hex}18`, border: `1px solid ${hex}35` }}
|
||||
style={{
|
||||
color: hex,
|
||||
backgroundColor: `${hex}18`,
|
||||
border: `1px solid ${hex}35`,
|
||||
}}
|
||||
>
|
||||
<Check className="w-2.5 h-2.5" />
|
||||
{p.name}
|
||||
@@ -165,103 +204,157 @@ function ToolActivityRow({ content }: { content: string }) {
|
||||
);
|
||||
}
|
||||
|
||||
const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: ChatMessage; queenPhase?: "planning" | "building" | "staging" | "running" }) {
|
||||
const isUser = msg.type === "user";
|
||||
const isQueen = msg.role === "queen";
|
||||
const color = getColor(msg.agent, msg.role);
|
||||
const MessageBubble = memo(
|
||||
function MessageBubble({
|
||||
msg,
|
||||
queenPhase,
|
||||
}: {
|
||||
msg: ChatMessage;
|
||||
queenPhase?: "planning" | "building" | "staging" | "running";
|
||||
}) {
|
||||
const isUser = msg.type === "user";
|
||||
const isQueen = msg.role === "queen";
|
||||
const color = getColor(msg.agent, msg.role);
|
||||
|
||||
if (msg.type === "run_divider") {
|
||||
return (
|
||||
<div className="flex items-center gap-3 py-2 my-1">
|
||||
<div className="flex-1 h-px bg-border/60" />
|
||||
<span className="text-[10px] text-muted-foreground font-medium uppercase tracking-wider">
|
||||
{msg.content}
|
||||
</span>
|
||||
<div className="flex-1 h-px bg-border/60" />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
if (msg.type === "system") {
|
||||
return (
|
||||
<div className="flex justify-center py-1">
|
||||
<span className="text-[11px] text-muted-foreground bg-muted/60 px-3 py-1.5 rounded-full">
|
||||
{msg.content}
|
||||
</span>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
if (msg.type === "tool_status") {
|
||||
return <ToolActivityRow content={msg.content} />;
|
||||
}
|
||||
|
||||
if (isUser) {
|
||||
return (
|
||||
<div className="flex justify-end">
|
||||
<div className="max-w-[75%] bg-primary text-primary-foreground text-sm leading-relaxed rounded-2xl rounded-br-md px-4 py-3">
|
||||
<p className="whitespace-pre-wrap break-words">{msg.content}</p>
|
||||
if (msg.type === "run_divider") {
|
||||
return (
|
||||
<div className="flex items-center gap-3 py-2 my-1">
|
||||
<div className="flex-1 h-px bg-border/60" />
|
||||
<span className="text-[10px] text-muted-foreground font-medium uppercase tracking-wider">
|
||||
{msg.content}
|
||||
</span>
|
||||
<div className="flex-1 h-px bg-border/60" />
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="flex gap-3">
|
||||
<div
|
||||
className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
|
||||
style={{
|
||||
backgroundColor: `${color}18`,
|
||||
border: `1.5px solid ${color}35`,
|
||||
boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
|
||||
}}
|
||||
>
|
||||
{isQueen ? (
|
||||
<Crown className="w-4 h-4" style={{ color }} />
|
||||
) : (
|
||||
<Cpu className="w-3.5 h-3.5" style={{ color }} />
|
||||
)}
|
||||
</div>
|
||||
<div className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<span className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`} style={{ color }}>
|
||||
{msg.agent}
|
||||
</span>
|
||||
<span
|
||||
className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
|
||||
isQueen ? "bg-primary/15 text-primary" : "bg-muted text-muted-foreground"
|
||||
}`}
|
||||
>
|
||||
{isQueen
|
||||
? ((msg.phase ?? queenPhase) === "running"
|
||||
? "running"
|
||||
: (msg.phase ?? queenPhase) === "staging"
|
||||
? "staging"
|
||||
: (msg.phase ?? queenPhase) === "planning"
|
||||
? "planning"
|
||||
: "building")
|
||||
: "Worker"}
|
||||
if (msg.type === "system") {
|
||||
return (
|
||||
<div className="flex justify-center py-1">
|
||||
<span className="text-[11px] text-muted-foreground bg-muted/60 px-3 py-1.5 rounded-full">
|
||||
{msg.content}
|
||||
</span>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
if (msg.type === "tool_status") {
|
||||
return <ToolActivityRow content={msg.content} />;
|
||||
}
|
||||
|
||||
if (isUser) {
|
||||
return (
|
||||
<div className="flex justify-end">
|
||||
<div className="max-w-[75%] bg-primary text-primary-foreground text-sm leading-relaxed rounded-2xl rounded-br-md px-4 py-3">
|
||||
{msg.images && msg.images.length > 0 && (
|
||||
<div className="flex flex-wrap gap-2 mb-2">
|
||||
{msg.images.map((img, i) => (
|
||||
<img
|
||||
key={i}
|
||||
src={img.image_url.url}
|
||||
alt={`attachment ${i + 1}`}
|
||||
className="max-h-48 max-w-full rounded-lg object-contain"
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
{msg.content && (
|
||||
<p className="whitespace-pre-wrap break-words">{msg.content}</p>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="flex gap-3">
|
||||
<div
|
||||
className={`flex-shrink-0 ${isQueen ? "w-9 h-9" : "w-7 h-7"} rounded-xl flex items-center justify-center`}
|
||||
style={{
|
||||
backgroundColor: `${color}18`,
|
||||
border: `1.5px solid ${color}35`,
|
||||
boxShadow: isQueen ? `0 0 12px ${color}20` : undefined,
|
||||
}}
|
||||
>
|
||||
{isQueen ? (
|
||||
<Crown className="w-4 h-4" style={{ color }} />
|
||||
) : (
|
||||
<Cpu className="w-3.5 h-3.5" style={{ color }} />
|
||||
)}
|
||||
</div>
|
||||
<div
|
||||
className={`text-sm leading-relaxed rounded-2xl rounded-tl-md px-4 py-3 ${
|
||||
isQueen ? "border border-primary/20 bg-primary/5" : "bg-muted/60"
|
||||
}`}
|
||||
className={`flex-1 min-w-0 ${isQueen ? "max-w-[85%]" : "max-w-[75%]"}`}
|
||||
>
|
||||
<MarkdownContent content={msg.content} />
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<span
|
||||
className={`font-medium ${isQueen ? "text-sm" : "text-xs"}`}
|
||||
style={{ color }}
|
||||
>
|
||||
{msg.agent}
|
||||
</span>
|
||||
<span
|
||||
className={`text-[10px] font-medium px-1.5 py-0.5 rounded-md ${
|
||||
isQueen
|
||||
? "bg-primary/15 text-primary"
|
||||
: "bg-muted text-muted-foreground"
|
||||
}`}
|
||||
>
|
||||
{isQueen
|
||||
? (msg.phase ?? queenPhase) === "running"
|
||||
? "running"
|
||||
: (msg.phase ?? queenPhase) === "staging"
|
||||
? "staging"
|
||||
: (msg.phase ?? queenPhase) === "planning"
|
||||
? "planning"
|
||||
: "building"
|
||||
: "Worker"}
|
||||
</span>
|
||||
</div>
|
||||
<div
|
||||
className={`text-sm leading-relaxed rounded-2xl rounded-tl-md px-4 py-3 ${
|
||||
isQueen ? "border border-primary/20 bg-primary/5" : "bg-muted/60"
|
||||
}`}
|
||||
>
|
||||
<MarkdownContent content={msg.content} />
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content && prev.msg.phase === next.msg.phase && prev.queenPhase === next.queenPhase);
|
||||
);
|
||||
},
|
||||
(prev, next) =>
|
||||
prev.msg.id === next.msg.id &&
|
||||
prev.msg.content === next.msg.content &&
|
||||
prev.msg.phase === next.msg.phase &&
|
||||
prev.queenPhase === next.queenPhase,
|
||||
);
|
||||
|
||||
export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, pendingQuestions, onQuestionSubmit, onMultiQuestionSubmit, onQuestionDismiss, queenPhase, contextUsage }: ChatPanelProps) {
|
||||
export default function ChatPanel({
|
||||
messages,
|
||||
onSend,
|
||||
isWaiting,
|
||||
isWorkerWaiting,
|
||||
isBusy,
|
||||
activeThread,
|
||||
disabled,
|
||||
onCancel,
|
||||
pendingQuestion,
|
||||
pendingOptions,
|
||||
pendingQuestions,
|
||||
onQuestionSubmit,
|
||||
onMultiQuestionSubmit,
|
||||
onQuestionDismiss,
|
||||
queenPhase,
|
||||
contextUsage,
|
||||
supportsImages = true,
|
||||
}: ChatPanelProps) {
|
||||
const [input, setInput] = useState("");
|
||||
const [pendingImages, setPendingImages] = useState<ImageContent[]>([]);
|
||||
const [readMap, setReadMap] = useState<Record<string, number>>({});
|
||||
const bottomRef = useRef<HTMLDivElement>(null);
|
||||
const scrollRef = useRef<HTMLDivElement>(null);
|
||||
const stickToBottom = useRef(true);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
const threadMessages = messages.filter((m) => {
|
||||
if (m.type === "system" && !m.thread) return false;
|
||||
@@ -270,7 +363,8 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
// tool-use-only turns that have no visible text. During live operation
|
||||
// tool pills provide context, but on resume the pills are gone so
|
||||
// the empty bubble is meaningless.
|
||||
if (m.role === "queen" && !m.type && (!m.content || !m.content.trim())) return false;
|
||||
if (m.role === "queen" && !m.type && (!m.content || !m.content.trim()))
|
||||
return false;
|
||||
return true;
|
||||
});
|
||||
|
||||
@@ -317,7 +411,8 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
// Worker message from a non-subagent node means the graph has
|
||||
// moved on to the next stage. Close the bubble even if some
|
||||
// subagents are still streaming in the background.
|
||||
if (m.role === "worker" && m.nodeId && !m.nodeId.includes(":subagent:")) break;
|
||||
if (m.role === "worker" && m.nodeId && !m.nodeId.includes(":subagent:"))
|
||||
break;
|
||||
|
||||
// Soft interruption (queen output, system, tool_status without
|
||||
// nodeId) — render it normally but keep the subagent run going
|
||||
@@ -382,31 +477,63 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
|
||||
const handleSubmit = (e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
if (!input.trim()) return;
|
||||
onSend(input.trim(), activeThread);
|
||||
if (!input.trim() && pendingImages.length === 0) return;
|
||||
onSend(
|
||||
input.trim(),
|
||||
activeThread,
|
||||
pendingImages.length > 0 ? pendingImages : undefined,
|
||||
);
|
||||
setInput("");
|
||||
setPendingImages([]);
|
||||
if (textareaRef.current) textareaRef.current.style.height = "auto";
|
||||
};
|
||||
|
||||
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
||||
const files = Array.from(e.target.files ?? []);
|
||||
if (files.length === 0) return;
|
||||
files.forEach((file) => {
|
||||
const reader = new FileReader();
|
||||
reader.onload = (ev) => {
|
||||
const url = ev.target?.result as string;
|
||||
setPendingImages((prev) => [
|
||||
...prev,
|
||||
{ type: "image_url", image_url: { url } },
|
||||
]);
|
||||
};
|
||||
reader.readAsDataURL(file);
|
||||
});
|
||||
// Reset so the same file can be re-selected
|
||||
e.target.value = "";
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full min-w-0">
|
||||
{/* Compact sub-header */}
|
||||
<div className="px-5 pt-4 pb-2 flex items-center gap-2">
|
||||
<p className="text-[11px] text-muted-foreground font-medium uppercase tracking-wider">Conversation</p>
|
||||
<p className="text-[11px] text-muted-foreground font-medium uppercase tracking-wider">
|
||||
Conversation
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Messages */}
|
||||
<div ref={scrollRef} onScroll={handleScroll} className="flex-1 overflow-auto px-5 py-4 space-y-3">
|
||||
<div
|
||||
ref={scrollRef}
|
||||
onScroll={handleScroll}
|
||||
className="flex-1 overflow-auto px-5 py-4 space-y-3"
|
||||
>
|
||||
{renderItems.map((item) =>
|
||||
item.kind === "parallel" ? (
|
||||
<div key={item.groupId}>
|
||||
<ParallelSubagentBubble groupId={item.groupId} groups={item.groups} />
|
||||
<ParallelSubagentBubble
|
||||
groupId={item.groupId}
|
||||
groups={item.groups}
|
||||
/>
|
||||
</div>
|
||||
) : (
|
||||
<div key={item.msg.id}>
|
||||
<MessageBubble msg={item.msg} queenPhase={queenPhase} />
|
||||
</div>
|
||||
)
|
||||
),
|
||||
)}
|
||||
|
||||
{/* Show typing indicator while waiting for first queen response (disabled + empty chat) */}
|
||||
@@ -424,9 +551,18 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
</div>
|
||||
<div className="border border-primary/20 bg-primary/5 rounded-2xl rounded-tl-md px-4 py-3">
|
||||
<div className="flex gap-1.5">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "0ms" }} />
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "150ms" }} />
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "300ms" }} />
|
||||
<span
|
||||
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
|
||||
style={{ animationDelay: "0ms" }}
|
||||
/>
|
||||
<span
|
||||
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
|
||||
style={{ animationDelay: "150ms" }}
|
||||
/>
|
||||
<span
|
||||
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
|
||||
style={{ animationDelay: "300ms" }}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -444,9 +580,18 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
</div>
|
||||
<div className="bg-muted/60 rounded-2xl rounded-tl-md px-4 py-3">
|
||||
<div className="flex gap-1.5">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "0ms" }} />
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "150ms" }} />
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "300ms" }} />
|
||||
<span
|
||||
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
|
||||
style={{ animationDelay: "0ms" }}
|
||||
/>
|
||||
<span
|
||||
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
|
||||
style={{ animationDelay: "150ms" }}
|
||||
/>
|
||||
<span
|
||||
className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce"
|
||||
style={{ animationDelay: "300ms" }}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -458,46 +603,84 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
{(() => {
|
||||
if (!contextUsage) return null;
|
||||
const queenUsage = contextUsage["__queen__"];
|
||||
const workerEntries = Object.entries(contextUsage).filter(([k]) => k !== "__queen__");
|
||||
const workerUsage = workerEntries.length > 0
|
||||
? workerEntries.reduce((best, [, v]) => (v.usagePct > best.usagePct ? v : best), workerEntries[0][1])
|
||||
: undefined;
|
||||
const workerEntries = Object.entries(contextUsage).filter(
|
||||
([k]) => k !== "__queen__",
|
||||
);
|
||||
const workerUsage =
|
||||
workerEntries.length > 0
|
||||
? workerEntries.reduce(
|
||||
(best, [, v]) => (v.usagePct > best.usagePct ? v : best),
|
||||
workerEntries[0][1],
|
||||
)
|
||||
: undefined;
|
||||
if (!queenUsage && !workerUsage) return null;
|
||||
return (
|
||||
<div className="flex items-center gap-3 mx-4 px-3 py-1 rounded-lg bg-muted/30 border border-border/20 group/ctx flex-shrink-0">
|
||||
{queenUsage && (
|
||||
<div className="flex items-center gap-2 flex-1 min-w-0" title={`Queen: ${(queenUsage.estimatedTokens / 1000).toFixed(1)}k / ${(queenUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${queenUsage.messageCount} messages`}>
|
||||
<Crown className="w-3 h-3 flex-shrink-0" style={{ color: "hsl(45,95%,58%)" }} />
|
||||
<div
|
||||
className="flex items-center gap-2 flex-1 min-w-0"
|
||||
title={`Queen: ${(queenUsage.estimatedTokens / 1000).toFixed(1)}k / ${(queenUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${queenUsage.messageCount} messages`}
|
||||
>
|
||||
<Crown
|
||||
className="w-3 h-3 flex-shrink-0"
|
||||
style={{ color: "hsl(45,95%,58%)" }}
|
||||
/>
|
||||
<div className="flex-1 h-1.5 rounded-full bg-muted/50 overflow-hidden min-w-[60px]">
|
||||
<div
|
||||
className="h-full rounded-full transition-all duration-500 ease-out"
|
||||
style={{
|
||||
width: `${Math.min(queenUsage.usagePct, 100)}%`,
|
||||
backgroundColor: queenUsage.usagePct >= 90 ? "hsl(0,65%,55%)" : queenUsage.usagePct >= 70 ? "hsl(35,90%,55%)" : "hsl(45,95%,58%)",
|
||||
backgroundColor:
|
||||
queenUsage.usagePct >= 90
|
||||
? "hsl(0,65%,55%)"
|
||||
: queenUsage.usagePct >= 70
|
||||
? "hsl(35,90%,55%)"
|
||||
: "hsl(45,95%,58%)",
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
<span className="text-[10px] text-muted-foreground/70 flex-shrink-0 tabular-nums">
|
||||
<span className="group-hover/ctx:hidden">{queenUsage.usagePct}%</span>
|
||||
<span className="hidden group-hover/ctx:inline">{(queenUsage.estimatedTokens / 1000).toFixed(1)}k / {(queenUsage.maxTokens / 1000).toFixed(0)}k</span>
|
||||
<span className="group-hover/ctx:hidden">
|
||||
{queenUsage.usagePct}%
|
||||
</span>
|
||||
<span className="hidden group-hover/ctx:inline">
|
||||
{(queenUsage.estimatedTokens / 1000).toFixed(1)}k /{" "}
|
||||
{(queenUsage.maxTokens / 1000).toFixed(0)}k
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
{workerUsage && (
|
||||
<div className="flex items-center gap-2 flex-1 min-w-0" title={`Worker: ${(workerUsage.estimatedTokens / 1000).toFixed(1)}k / ${(workerUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${workerUsage.messageCount} messages`}>
|
||||
<Cpu className="w-3 h-3 flex-shrink-0" style={{ color: "hsl(220,60%,55%)" }} />
|
||||
<div
|
||||
className="flex items-center gap-2 flex-1 min-w-0"
|
||||
title={`Worker: ${(workerUsage.estimatedTokens / 1000).toFixed(1)}k / ${(workerUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${workerUsage.messageCount} messages`}
|
||||
>
|
||||
<Cpu
|
||||
className="w-3 h-3 flex-shrink-0"
|
||||
style={{ color: "hsl(220,60%,55%)" }}
|
||||
/>
|
||||
<div className="flex-1 h-1.5 rounded-full bg-muted/50 overflow-hidden min-w-[60px]">
|
||||
<div
|
||||
className="h-full rounded-full transition-all duration-500 ease-out"
|
||||
style={{
|
||||
width: `${Math.min(workerUsage.usagePct, 100)}%`,
|
||||
backgroundColor: workerUsage.usagePct >= 90 ? "hsl(0,65%,55%)" : workerUsage.usagePct >= 70 ? "hsl(35,90%,55%)" : "hsl(220,60%,55%)",
|
||||
backgroundColor:
|
||||
workerUsage.usagePct >= 90
|
||||
? "hsl(0,65%,55%)"
|
||||
: workerUsage.usagePct >= 70
|
||||
? "hsl(35,90%,55%)"
|
||||
: "hsl(220,60%,55%)",
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
<span className="text-[10px] text-muted-foreground/70 flex-shrink-0 tabular-nums">
|
||||
<span className="group-hover/ctx:hidden">{workerUsage.usagePct}%</span>
|
||||
<span className="hidden group-hover/ctx:inline">{(workerUsage.estimatedTokens / 1000).toFixed(1)}k / {(workerUsage.maxTokens / 1000).toFixed(0)}k</span>
|
||||
<span className="group-hover/ctx:hidden">
|
||||
{workerUsage.usagePct}%
|
||||
</span>
|
||||
<span className="hidden group-hover/ctx:inline">
|
||||
{(workerUsage.estimatedTokens / 1000).toFixed(1)}k /{" "}
|
||||
{(workerUsage.maxTokens / 1000).toFixed(0)}k
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
@@ -506,7 +689,9 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
})()}
|
||||
|
||||
{/* Input area — question widget replaces textarea when a question is pending */}
|
||||
{pendingQuestions && pendingQuestions.length >= 2 && onMultiQuestionSubmit ? (
|
||||
{pendingQuestions &&
|
||||
pendingQuestions.length >= 2 &&
|
||||
onMultiQuestionSubmit ? (
|
||||
<MultiQuestionWidget
|
||||
questions={pendingQuestions}
|
||||
onSubmit={onMultiQuestionSubmit}
|
||||
@@ -521,7 +706,47 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
/>
|
||||
) : (
|
||||
<form onSubmit={handleSubmit} className="p-4">
|
||||
{/* Image preview strip */}
|
||||
{pendingImages.length > 0 && (
|
||||
<div className="flex flex-wrap gap-2 mb-2 px-1">
|
||||
{pendingImages.map((img, i) => (
|
||||
<div key={i} className="relative group">
|
||||
<img
|
||||
src={img.image_url.url}
|
||||
alt={`preview ${i + 1}`}
|
||||
className="h-16 w-16 object-cover rounded-lg border border-border"
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() =>
|
||||
setPendingImages((prev) => prev.filter((_, j) => j !== i))
|
||||
}
|
||||
className="absolute -top-1.5 -right-1.5 w-4 h-4 rounded-full bg-destructive text-destructive-foreground flex items-center justify-center opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-2.5 h-2.5" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<div className="flex items-center gap-3 bg-muted/40 rounded-xl px-4 py-2.5 border border-border focus-within:border-primary/40 transition-colors">
|
||||
<input
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
accept="image/*"
|
||||
multiple
|
||||
className="hidden"
|
||||
onChange={handleFileChange}
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
disabled={disabled || !supportsImages}
|
||||
onClick={() => supportsImages && fileInputRef.current?.click()}
|
||||
className="flex-shrink-0 p-1 rounded-md text-muted-foreground hover:text-foreground disabled:opacity-30 transition-colors"
|
||||
title={supportsImages ? "Attach image" : "Image not supported by the current model"}
|
||||
>
|
||||
<Paperclip className="w-4 h-4" />
|
||||
</button>
|
||||
<textarea
|
||||
ref={textareaRef}
|
||||
rows={1}
|
||||
@@ -538,7 +763,9 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
handleSubmit(e);
|
||||
}
|
||||
}}
|
||||
placeholder={disabled ? "Connecting to agent..." : "Message Queen Bee..."}
|
||||
placeholder={
|
||||
disabled ? "Connecting to agent..." : "Message Queen Bee..."
|
||||
}
|
||||
disabled={disabled}
|
||||
className="flex-1 bg-transparent text-sm text-foreground outline-none placeholder:text-muted-foreground disabled:opacity-50 disabled:cursor-not-allowed resize-none overflow-y-auto"
|
||||
/>
|
||||
@@ -553,7 +780,9 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
) : (
|
||||
<button
|
||||
type="submit"
|
||||
disabled={!input.trim() || disabled}
|
||||
disabled={
|
||||
(!input.trim() && pendingImages.length === 0) || disabled
|
||||
}
|
||||
className="p-2 rounded-lg bg-primary text-primary-foreground disabled:opacity-30 hover:opacity-90 transition-opacity"
|
||||
>
|
||||
<Send className="w-4 h-4" />
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import { useState, useCallback, useRef, useEffect, useMemo } from "react";
|
||||
import ReactDOM from "react-dom";
|
||||
import { useSearchParams, useNavigate } from "react-router-dom";
|
||||
import { Plus, KeyRound, Sparkles, Layers, ChevronLeft, Bot, Loader2, WifiOff, X } from "lucide-react";
|
||||
import { Plus, KeyRound, Sparkles, Layers, ChevronLeft, Bot, Loader2, WifiOff, X, FolderOpen } from "lucide-react";
|
||||
import type { GraphNode, NodeStatus } from "@/components/graph-types";
|
||||
import DraftGraph from "@/components/DraftGraph";
|
||||
import ChatPanel, { type ChatMessage } from "@/components/ChatPanel";
|
||||
@@ -354,6 +354,8 @@ interface AgentBackendState {
|
||||
pendingQuestionSource: "queen" | "worker" | null;
|
||||
/** Per-node context window usage (from context_usage_updated events) */
|
||||
contextUsage: Record<string, { usagePct: number; messageCount: number; estimatedTokens: number; maxTokens: number }>;
|
||||
/** Whether the queen's LLM supports image content — false disables the attach button */
|
||||
queenSupportsImages: boolean;
|
||||
}
|
||||
|
||||
function defaultAgentState(): AgentBackendState {
|
||||
@@ -392,6 +394,7 @@ function defaultAgentState(): AgentBackendState {
|
||||
pendingQuestions: null,
|
||||
pendingQuestionSource: null,
|
||||
contextUsage: {},
|
||||
queenSupportsImages: true,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -923,6 +926,7 @@ export default function Workspace() {
|
||||
queenReady: true,
|
||||
queenPhase: qPhase,
|
||||
queenBuilding: qPhase === "building",
|
||||
queenSupportsImages: liveSession.queen_supports_images !== false,
|
||||
// Restore flowchart overlay from persisted events
|
||||
...(restoredFlowchartMap ? { flowchartMap: restoredFlowchartMap } : {}),
|
||||
...(restoredOriginalDraft ? { originalDraft: restoredOriginalDraft, draftGraph: null } : {}),
|
||||
@@ -1122,6 +1126,7 @@ export default function Workspace() {
|
||||
displayName,
|
||||
queenPhase: initialPhase,
|
||||
queenBuilding: initialPhase === "building",
|
||||
queenSupportsImages: session.queen_supports_images !== false,
|
||||
// Restore flowchart overlay from persisted events
|
||||
...(restoredFlowchartMap ? { flowchartMap: restoredFlowchartMap } : {}),
|
||||
...(restoredOriginalDraft ? { originalDraft: restoredOriginalDraft, draftGraph: null } : {}),
|
||||
@@ -2613,7 +2618,7 @@ export default function Workspace() {
|
||||
});
|
||||
|
||||
// --- handleSend ---
|
||||
const handleSend = useCallback((text: string, thread: string) => {
|
||||
const handleSend = useCallback((text: string, thread: string, images?: import("@/components/ChatPanel").ImageContent[]) => {
|
||||
if (!activeSession) return;
|
||||
const state = agentStates[activeWorker];
|
||||
|
||||
@@ -2679,6 +2684,7 @@ export default function Workspace() {
|
||||
const userMsg: ChatMessage = {
|
||||
id: makeId(), agent: "You", agentColor: "",
|
||||
content: text, timestamp: "", type: "user", thread, createdAt: Date.now(),
|
||||
images,
|
||||
};
|
||||
setSessionsByAgent(prev => ({
|
||||
...prev,
|
||||
@@ -2690,7 +2696,7 @@ export default function Workspace() {
|
||||
updateAgentState(activeWorker, { isTyping: true, queenIsTyping: true });
|
||||
|
||||
if (state?.sessionId && state?.ready) {
|
||||
executionApi.chat(state.sessionId, text).catch((err: unknown) => {
|
||||
executionApi.chat(state.sessionId, text, images).catch((err: unknown) => {
|
||||
const errMsg = err instanceof Error ? err.message : String(err);
|
||||
const errorChatMsg: ChatMessage = {
|
||||
id: makeId(), agent: "System", agentColor: "",
|
||||
@@ -3106,6 +3112,16 @@ export default function Workspace() {
|
||||
<KeyRound className="w-3.5 h-3.5" />
|
||||
Credentials
|
||||
</button>
|
||||
{activeAgentState?.sessionId && (
|
||||
<button
|
||||
onClick={() => sessionsApi.revealFolder(activeAgentState.sessionId!).catch(() => {})}
|
||||
className="flex items-center gap-1.5 px-3 py-1.5 rounded-md text-xs font-medium text-muted-foreground hover:text-foreground hover:bg-muted/50 transition-colors flex-shrink-0"
|
||||
title="Open session data folder"
|
||||
>
|
||||
<FolderOpen className="w-3.5 h-3.5" />
|
||||
Data
|
||||
</button>
|
||||
)}
|
||||
</TopBar>
|
||||
|
||||
{/* Main content area */}
|
||||
@@ -3224,6 +3240,7 @@ export default function Workspace() {
|
||||
onMultiQuestionSubmit={handleMultiQuestionAnswer}
|
||||
onQuestionDismiss={handleQuestionDismiss}
|
||||
contextUsage={activeAgentState?.contextUsage}
|
||||
supportsImages={activeAgentState?.queenSupportsImages ?? true}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Tests for LLM model capability checks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
|
||||
|
||||
class TestSupportsImageToolResults:
    """Verify the deny-list correctly identifies models that can't handle images."""

    # Vision-capable models, including provider-prefixed routing variants.
    _VISION_CAPABLE = [
        "gpt-4o",
        "gpt-4o-mini",
        "gpt-4-turbo",
        "openai/gpt-4o",
        "anthropic/claude-sonnet-4-20250514",
        "claude-haiku-4-5-20251001",
        "gemini/gemini-1.5-pro",
        "google/gemini-1.5-flash",
        "mistral/mistral-large",
        "groq/llama3-70b",
        "together/meta-llama/Llama-3-70b",
        "fireworks_ai/llama-v3-70b",
        "azure/gpt-4o",
        "kimi/claude-sonnet-4-20250514",
        "hive/claude-sonnet-4-20250514",
    ]

    # Text-only backends and local runtimes that must not receive images.
    _TEXT_ONLY = [
        "deepseek/deepseek-chat",
        "deepseek/deepseek-coder",
        "deepseek-chat",
        "deepseek-reasoner",
        "ollama/llama3",
        "ollama/mistral",
        "ollama_chat/llama3",
        "lm_studio/my-model",
        "vllm/meta-llama/Llama-3-70b",
        "llamacpp/model",
        "cerebras/llama3-70b",
    ]

    @pytest.mark.parametrize("model", _VISION_CAPABLE)
    def test_supported_models(self, model: str):
        assert supports_image_tool_results(model) is True

    @pytest.mark.parametrize("model", _TEXT_ONLY)
    def test_unsupported_models(self, model: str):
        assert supports_image_tool_results(model) is False

    def test_case_insensitive(self):
        # The check must ignore case in both provider prefix and model name.
        for model, expected in (
            ("DeepSeek/deepseek-chat", False),
            ("OLLAMA/llama3", False),
            ("GPT-4o", True),
        ):
            assert supports_image_tool_results(model) is expected
|
||||
@@ -48,6 +48,9 @@ dev = [
|
||||
sandbox = [
|
||||
"RestrictedPython>=7.0",
|
||||
]
|
||||
browser = [
|
||||
"pillow>=10.0.0",
|
||||
]
|
||||
ocr = [
|
||||
"pytesseract>=0.3.10",
|
||||
"pillow>=10.0.0",
|
||||
|
||||
@@ -0,0 +1,192 @@
|
||||
"""Ref system for aria snapshots.
|
||||
|
||||
Assigns short `[ref=eN]` markers to interactive elements in Playwright's
|
||||
aria_snapshot() output so the LLM can reference elements by ref instead of
|
||||
constructing fragile CSS selectors.
|
||||
|
||||
Usage:
|
||||
annotated, ref_map = annotate_snapshot(raw_snapshot)
|
||||
# ... later, when the LLM says selector="e5" ...
|
||||
playwright_selector = resolve_ref("e5", ref_map)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .session import BrowserSession
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Role sets (matching Playwright's aria roles that matter for interaction)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Roles that accept direct user interaction — these always receive refs.
INTERACTIVE_ROLES: frozenset[str] = frozenset(
    (
        "button",
        "checkbox",
        "combobox",
        "link",
        "listbox",
        "menuitem",
        "menuitemcheckbox",
        "menuitemradio",
        "option",
        "radio",
        "scrollbar",
        "searchbox",
        "slider",
        "spinbutton",
        "switch",
        "tab",
        "textbox",
        "treeitem",
    )
)

# Content roles that only get refs when they carry an accessible name.
NAMED_CONTENT_ROLES: frozenset[str] = frozenset(("cell", "heading", "img"))

# Regex: captures indent, role, optional quoted name, and trailing text.
# Example line: "  - button \"Submit\" [disabled]"
#   group(1)=indent/bullet, group(2)=role "button",
#   group(3)=name "Submit" (or None), group(4)=rest " [disabled]"
_LINE_RE = re.compile(r"^(\s*-\s+)(\w+)(?:\s+\"([^\"]*)\")?(.*?)$")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RefEntry:
    """A single ref entry mapping to a Playwright role selector."""

    # Aria role of the element (e.g. "button", "link").
    role: str
    # Accessible name captured from the snapshot line, or None when the
    # line had no quoted name.
    name: str | None
    # 0-based index among elements sharing the same (role, name) pair,
    # used for ">> nth=" disambiguation when the ref is resolved.
    nth: int


# ref_id (e.g. "e0") -> RefEntry
RefMap = dict[str, RefEntry]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# annotate_snapshot
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def annotate_snapshot(snapshot: str) -> tuple[str, RefMap]:
    """Inject ``[ref=eN]`` markers into an aria snapshot.

    Refs are assigned in line order: interactive roles always qualify,
    named-content roles qualify only when they carry a non-empty name.

    Returns:
        (annotated_text, ref_map) where ref_map maps ref ids to RefEntry.
    """
    out_lines = snapshot.split("\n")

    ref_map: RefMap = {}
    # Running occurrence count per (role, name) pair, for nth disambiguation.
    occurrences: dict[tuple[str, str | None], int] = {}

    for idx, raw in enumerate(out_lines):
        match = _LINE_RE.match(raw)
        if match is None:
            continue

        role = match.group(2)
        name = match.group(3)  # None if no quoted name

        if role not in INTERACTIVE_ROLES and not (name and role in NAMED_CONTENT_ROLES):
            continue

        key = (role, name)
        nth = occurrences.get(key, 0)
        occurrences[key] = nth + 1

        # Refs are numbered sequentially in assignment order.
        ref_id = f"e{len(ref_map)}"
        ref_map[ref_id] = RefEntry(role=role, name=name, nth=nth)

        # Append the marker after stripping trailing whitespace.
        out_lines[idx] = raw.rstrip() + f" [ref={ref_id}]"

    return "\n".join(out_lines), ref_map
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# resolve_ref
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# A ref id is the letter "e" followed by one or more digits (e.g. "e12").
_REF_PATTERN = re.compile(r"^e\d+$")


def resolve_ref(selector: str, ref_map: RefMap | None) -> str:
    """Resolve a ref id (e.g. ``"e5"``) to a Playwright role selector.

    If *selector* doesn't look like a ref (``e\\d+``), it's returned as-is
    so that plain CSS selectors keep working.

    Raises:
        ValueError: If the ref is not found or no snapshot has been taken.
    """
    # Anything not shaped like a ref passes through untouched
    # (CSS / XPath / role= selectors).
    if _REF_PATTERN.match(selector) is None:
        return selector

    if ref_map is None:
        raise ValueError(
            f"Ref '{selector}' used but no snapshot has been taken yet. "
            "Call browser_snapshot first."
        )

    if selector not in ref_map:
        valid = ", ".join(sorted(ref_map.keys(), key=lambda k: int(k[1:])))
        raise ValueError(
            f"Ref '{selector}' not found. Valid refs: {valid}. "
            "The page may have changed — take a new snapshot."
        )

    entry = ref_map[selector]

    # Build the Playwright role selector, quoting the name when present.
    if entry.name is None:
        base = f"role={entry.role}"
    else:
        quoted = entry.name.replace("\\", "\\\\").replace('"', '\\"')
        base = f'role={entry.role}[name="{quoted}"]'

    # Always include nth to disambiguate repeated (role, name) pairs.
    return base + f" >> nth={entry.nth}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convenience wrapper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def resolve_selector(
    selector: str,
    session: BrowserSession,
    target_id: str | None,
) -> str:
    """Resolve a selector that might be a ref, using the session's ref maps.

    Args:
        selector: A CSS selector or ref id (e.g. ``"e5"``).
        session: The current BrowserSession.
        target_id: The target page id (falls back to session.active_page_id).
    """
    page_id = target_id if target_id else session.active_page_id
    # No page id means no snapshot could have been stored for this target.
    current_map = session.ref_maps.get(page_id) if page_id else None
    return resolve_ref(selector, current_map)
|
||||
@@ -353,6 +353,7 @@ class BrowserSession:
|
||||
active_page_id: str | None = None
|
||||
console_messages: dict[str, list[dict]] = field(default_factory=dict)
|
||||
page_meta: dict[str, TabMeta] = field(default_factory=dict)
|
||||
ref_maps: dict[str, dict] = field(default_factory=dict) # target_id → RefMap
|
||||
_playwright: Any = None
|
||||
_lock: asyncio.Lock = field(default_factory=asyncio.Lock)
|
||||
|
||||
@@ -447,6 +448,7 @@ class BrowserSession:
|
||||
self.active_page_id = None
|
||||
self.console_messages.clear()
|
||||
self.page_meta.clear()
|
||||
self.ref_maps.clear()
|
||||
|
||||
async def start(self, headless: bool = True, persistent: bool = True) -> dict:
|
||||
"""
|
||||
@@ -623,6 +625,7 @@ class BrowserSession:
|
||||
self.active_page_id = None
|
||||
self.console_messages.clear()
|
||||
self.page_meta.clear()
|
||||
self.ref_maps.clear()
|
||||
self.user_data_dir = None
|
||||
self.persistent = False
|
||||
|
||||
@@ -801,6 +804,7 @@ class BrowserSession:
|
||||
self.pages.pop(target_id, None)
|
||||
self.console_messages.pop(target_id, None)
|
||||
self.page_meta.pop(target_id, None)
|
||||
self.ref_maps.pop(target_id, None)
|
||||
|
||||
if self.active_page_id == target_id:
|
||||
self.active_page_id = next(iter(self.pages), None)
|
||||
|
||||
@@ -16,6 +16,7 @@ from playwright.async_api import (
|
||||
)
|
||||
|
||||
from ..highlight import highlight_element
|
||||
from ..refs import resolve_selector
|
||||
from ..session import DEFAULT_TIMEOUT_MS, get_session
|
||||
|
||||
|
||||
@@ -52,6 +53,10 @@ def register_advanced_tools(mcp: FastMCP) -> None:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
if selector:
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
await page.wait_for_selector(selector, timeout=timeout_ms)
|
||||
return {"ok": True, "action": "wait", "condition": "selector", "selector": selector}
|
||||
elif text:
|
||||
@@ -122,6 +127,11 @@ def register_advanced_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
element = await page.wait_for_selector(selector, timeout=timeout_ms)
|
||||
if not element:
|
||||
return {"ok": False, "error": f"Element not found: {selector}"}
|
||||
@@ -160,6 +170,11 @@ def register_advanced_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
element = await page.wait_for_selector(selector, timeout=timeout_ms)
|
||||
if not element:
|
||||
return {"ok": False, "error": f"Element not found: {selector}"}
|
||||
@@ -238,6 +253,11 @@ def register_advanced_tools(mcp: FastMCP) -> None:
|
||||
if not Path(path).exists():
|
||||
return {"ok": False, "error": f"File not found: {path}"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
await highlight_element(page, selector)
|
||||
|
||||
element = await page.wait_for_selector(selector, timeout=timeout_ms)
|
||||
|
||||
@@ -7,14 +7,113 @@ Tools for extracting content and capturing page state.
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from fastmcp import FastMCP
|
||||
from mcp.types import ImageContent, TextContent
|
||||
from playwright.async_api import Error as PlaywrightError
|
||||
|
||||
from ..session import get_session
|
||||
|
||||
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Screenshot normalization
# ---------------------------------------------------------------------------

# JPEG qualities tried at each candidate size, best first.
_QUALITY_STEPS = (85, 70, 50)
# Never shrink the longest side below this many pixels.
_MIN_DIMENSION = 400
# Step between candidate sizes when searching downward.
_DIMENSION_STEP = 200


def _normalize_screenshot(
    raw_bytes: bytes,
    image_type: str,
    *,
    max_dimension: int = 2000,
    max_bytes: int = 5_000_000,
) -> tuple[bytes, str]:
    """Normalize a screenshot to fit within size and dimension limits.

    Progressively resizes and compresses to JPEG until the image fits
    under *max_bytes* and *max_dimension*. If Pillow is not installed
    the original bytes are returned unchanged.

    Args:
        raw_bytes: Raw PNG or JPEG image bytes from Playwright.
        image_type: Original format (``"png"`` or ``"jpeg"``).
        max_dimension: Maximum width or height in pixels.
        max_bytes: Maximum file size in bytes.

    Returns:
        ``(normalized_bytes, image_type)`` where *image_type* may change
        to ``"jpeg"`` if compression was applied.
    """
    try:
        from PIL import Image
    except ImportError:
        # Best-effort feature: without Pillow we simply pass bytes through.
        logger.debug("Pillow not installed — skipping screenshot normalization")
        return raw_bytes, image_type

    try:
        probe = Image.open(io.BytesIO(raw_bytes))
        longest = max(probe.size)

        # Nothing to do when both constraints are already satisfied.
        if len(raw_bytes) <= max_bytes and longest <= max_dimension:
            return raw_bytes, image_type

        # Candidate target sizes, largest first; never upscale.
        sides = [
            side
            for side in range(max_dimension, _MIN_DIMENSION - 1, -_DIMENSION_STEP)
            if side < longest
        ]
        # An image that is small enough in pixels but too big in bytes is
        # first re-encoded at its current size.
        if longest <= max_dimension:
            sides.insert(0, longest)

        best: tuple[bytes, int] | None = None

        for side in sides:
            # thumbnail() mutates the image, so decode a fresh copy per candidate.
            resized = Image.open(io.BytesIO(raw_bytes))
            resized.thumbnail((side, side), Image.LANCZOS)

            # JPEG has no alpha channel.
            if resized.mode in ("RGBA", "LA", "P"):
                resized = resized.convert("RGB")

            for quality in _QUALITY_STEPS:
                sink = io.BytesIO()
                resized.save(sink, format="JPEG", quality=quality, optimize=True)
                encoded = sink.getvalue()

                if best is None or len(encoded) < best[1]:
                    best = (encoded, len(encoded))

                if len(encoded) <= max_bytes:
                    return encoded, "jpeg"

        # Nothing fit — fall back to the smallest encoding we produced.
        if best is not None:
            logger.warning(
                "Screenshot normalization: could not fit under %d bytes (best: %d bytes)",
                max_bytes,
                best[1],
            )
            return best[0], "jpeg"

        return raw_bytes, image_type

    except Exception:
        # Any decode/encode failure degrades gracefully to the original bytes.
        logger.warning("Screenshot normalization failed — returning original", exc_info=True)
        return raw_bytes, image_type
|
||||
|
||||
|
||||
def _format_ax_tree(nodes: list[dict[str, Any]]) -> str:
|
||||
"""Format a CDP Accessibility.getFullAXTree result into an indented text tree.
|
||||
@@ -102,10 +201,13 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
full_page: bool = False,
|
||||
selector: str | None = None,
|
||||
image_type: Literal["png", "jpeg"] = "png",
|
||||
) -> dict:
|
||||
) -> list:
|
||||
"""
|
||||
Take a screenshot of the current page.
|
||||
|
||||
Returns the screenshot as an image the LLM can see, alongside
|
||||
text metadata (URL, size, etc.).
|
||||
|
||||
Args:
|
||||
target_id: Tab ID (default: active tab)
|
||||
profile: Browser profile name (default: "default")
|
||||
@@ -114,18 +216,32 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
image_type: Image format - png or jpeg (default: png)
|
||||
|
||||
Returns:
|
||||
Dict with screenshot data (base64 encoded) and metadata
|
||||
List of content blocks: text metadata + image
|
||||
"""
|
||||
try:
|
||||
session = get_session(profile)
|
||||
page = session.get_page(target_id)
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
return [
|
||||
TextContent(
|
||||
type="text", text=json.dumps({"ok": False, "error": "No active tab"})
|
||||
)
|
||||
]
|
||||
|
||||
if selector:
|
||||
from ..refs import resolve_selector
|
||||
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
element = await page.query_selector(selector)
|
||||
if not element:
|
||||
return {"ok": False, "error": f"Element not found: {selector}"}
|
||||
return [
|
||||
TextContent(
|
||||
type="text",
|
||||
text=json.dumps(
|
||||
{"ok": False, "error": f"Element not found: {selector}"}
|
||||
),
|
||||
)
|
||||
]
|
||||
screenshot_bytes = await element.screenshot(type=image_type)
|
||||
else:
|
||||
screenshot_bytes = await page.screenshot(
|
||||
@@ -133,16 +249,31 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
type=image_type,
|
||||
)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"targetId": target_id or session.active_page_id,
|
||||
"url": page.url,
|
||||
"imageType": image_type,
|
||||
"imageBase64": base64.b64encode(screenshot_bytes).decode(),
|
||||
"size": len(screenshot_bytes),
|
||||
}
|
||||
normalized_bytes, normalized_type = _normalize_screenshot(screenshot_bytes, image_type)
|
||||
meta = json.dumps(
|
||||
{
|
||||
"ok": True,
|
||||
"targetId": target_id or session.active_page_id,
|
||||
"url": page.url,
|
||||
"imageType": normalized_type,
|
||||
"size": len(normalized_bytes),
|
||||
"originalSize": len(screenshot_bytes),
|
||||
}
|
||||
)
|
||||
return [
|
||||
TextContent(type="text", text=meta),
|
||||
ImageContent(
|
||||
type="image",
|
||||
data=base64.b64encode(normalized_bytes).decode(),
|
||||
mimeType=f"image/{normalized_type}",
|
||||
),
|
||||
]
|
||||
except PlaywrightError as e:
|
||||
return {"ok": False, "error": f"Browser error: {e!s}"}
|
||||
return [
|
||||
TextContent(
|
||||
type="text", text=json.dumps({"ok": False, "error": f"Browser error: {e!s}"})
|
||||
)
|
||||
]
|
||||
|
||||
@mcp.tool()
|
||||
async def browser_snapshot(
|
||||
@@ -196,6 +327,13 @@ def register_inspection_tools(mcp: FastMCP) -> None:
|
||||
await cdp.detach()
|
||||
else:
|
||||
snapshot = await page.locator(":root").aria_snapshot()
|
||||
# Annotate with [ref=eN] markers for interactive elements
|
||||
from ..refs import annotate_snapshot
|
||||
|
||||
snapshot, ref_map = annotate_snapshot(snapshot)
|
||||
tid = target_id or session.active_page_id
|
||||
if tid:
|
||||
session.ref_maps[tid] = ref_map
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
|
||||
@@ -17,7 +17,8 @@ from playwright.async_api import (
|
||||
)
|
||||
|
||||
from ..highlight import highlight_coordinate, highlight_element
|
||||
from ..session import DEFAULT_TIMEOUT_MS, get_session
|
||||
from ..refs import annotate_snapshot, resolve_selector
|
||||
from ..session import DEFAULT_TIMEOUT_MS, BrowserSession, get_session
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -27,6 +28,8 @@ _AUTO_SNAPSHOT_MAX_CHARS = 4000
|
||||
async def _auto_snapshot(
|
||||
page: Page,
|
||||
*,
|
||||
session: BrowserSession | None = None,
|
||||
target_id: str | None = None,
|
||||
wait_for_nav: bool = False,
|
||||
max_chars: int = _AUTO_SNAPSHOT_MAX_CHARS,
|
||||
) -> str | None:
|
||||
@@ -34,6 +37,8 @@ async def _auto_snapshot(
|
||||
|
||||
Args:
|
||||
page: Playwright Page instance.
|
||||
session: BrowserSession to store ref maps in.
|
||||
target_id: Target page id for ref map storage.
|
||||
wait_for_nav: If True, briefly wait for any in-flight navigation to
|
||||
settle before snapshotting. Used after click actions that may
|
||||
trigger page navigation.
|
||||
@@ -48,6 +53,14 @@ async def _auto_snapshot(
|
||||
except Exception:
|
||||
pass # No navigation happened — that's fine
|
||||
snapshot = await page.locator(":root").aria_snapshot()
|
||||
|
||||
# Annotate with refs before truncation so the full RefMap is captured
|
||||
if snapshot and session:
|
||||
snapshot, ref_map = annotate_snapshot(snapshot)
|
||||
tid = target_id or session.active_page_id
|
||||
if tid:
|
||||
session.ref_maps[tid] = ref_map
|
||||
|
||||
if snapshot and max_chars > 0 and len(snapshot) > max_chars:
|
||||
snapshot = (
|
||||
snapshot[:max_chars]
|
||||
@@ -96,6 +109,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
await highlight_element(page, selector)
|
||||
|
||||
if double_click:
|
||||
@@ -105,7 +123,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
|
||||
result: dict = {"ok": True, "action": "click", "selector": selector}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page, wait_for_nav=True)
|
||||
snapshot = await _auto_snapshot(
|
||||
page,
|
||||
session=session,
|
||||
target_id=target_id,
|
||||
wait_for_nav=True,
|
||||
)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
@@ -151,7 +174,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
await page.mouse.click(x, y, button=button)
|
||||
result: dict = {"ok": True, "action": "click_coordinate", "x": x, "y": y}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page, wait_for_nav=True)
|
||||
snapshot = await _auto_snapshot(
|
||||
page,
|
||||
session=session,
|
||||
target_id=target_id,
|
||||
wait_for_nav=True,
|
||||
)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
@@ -194,6 +222,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
await highlight_element(page, selector)
|
||||
|
||||
if clear_first:
|
||||
@@ -202,7 +235,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
await page.type(selector, text, delay=delay_ms, timeout=timeout_ms)
|
||||
result: dict = {"ok": True, "action": "type", "selector": selector, "length": len(text)}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page)
|
||||
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
@@ -244,12 +277,17 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
await highlight_element(page, selector)
|
||||
|
||||
await page.fill(selector, value, timeout=timeout_ms)
|
||||
result: dict = {"ok": True, "action": "fill", "selector": selector}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page)
|
||||
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
@@ -287,6 +325,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
if selector:
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
await page.press(selector, key, timeout=timeout_ms)
|
||||
else:
|
||||
await page.keyboard.press(key)
|
||||
@@ -322,6 +364,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
await page.hover(selector, timeout=timeout_ms)
|
||||
return {"ok": True, "action": "hover", "selector": selector}
|
||||
except PlaywrightTimeout:
|
||||
@@ -360,6 +407,11 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
selected = await page.select_option(selector, values, timeout=timeout_ms)
|
||||
result: dict = {
|
||||
"ok": True,
|
||||
@@ -368,7 +420,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
"selected": selected,
|
||||
}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page)
|
||||
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
@@ -422,6 +474,10 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
delta_x = -amount
|
||||
|
||||
if selector:
|
||||
try:
|
||||
selector = resolve_selector(selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
element = await page.query_selector(selector)
|
||||
if element:
|
||||
await element.evaluate(f"e => e.scrollBy({delta_x}, {delta_y})")
|
||||
@@ -435,7 +491,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
"amount": amount,
|
||||
}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page)
|
||||
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
@@ -474,6 +530,12 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
if not page:
|
||||
return {"ok": False, "error": "No active tab"}
|
||||
|
||||
try:
|
||||
start_selector = resolve_selector(start_selector, session, target_id)
|
||||
end_selector = resolve_selector(end_selector, session, target_id)
|
||||
except ValueError as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
await page.drag_and_drop(
|
||||
start_selector,
|
||||
end_selector,
|
||||
@@ -486,7 +548,7 @@ def register_interaction_tools(mcp: FastMCP) -> None:
|
||||
"to": end_selector,
|
||||
}
|
||||
if auto_snapshot:
|
||||
snapshot = await _auto_snapshot(page)
|
||||
snapshot = await _auto_snapshot(page, session=session, target_id=target_id)
|
||||
if snapshot:
|
||||
result["snapshot"] = snapshot
|
||||
result["url"] = page.url
|
||||
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Tests for the browser ref system (annotate_snapshot / resolve_ref)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from gcu.browser.refs import (
|
||||
RefEntry,
|
||||
annotate_snapshot,
|
||||
resolve_ref,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# annotate_snapshot
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SAMPLE_SNAPSHOT = """\
|
||||
- navigation "Main":
|
||||
- link "Home"
|
||||
- link "About"
|
||||
- main:
|
||||
- heading "Welcome"
|
||||
- textbox "Search"
|
||||
- button "Submit"
|
||||
- paragraph: some text here
|
||||
- img "Logo"
|
||||
- list:
|
||||
- listitem:
|
||||
- link "Item 1"
|
||||
- listitem:
|
||||
- link "Item 2\""""
|
||||
|
||||
|
||||
class TestAnnotateSnapshot:
|
||||
def test_assigns_refs_to_interactive_roles(self):
|
||||
annotated, ref_map = annotate_snapshot(SAMPLE_SNAPSHOT)
|
||||
# link, textbox, button should all get refs
|
||||
assert "[ref=e" in annotated
|
||||
# Check that specific interactive elements got refs
|
||||
roles_in_map = {entry.role for entry in ref_map.values()}
|
||||
assert "link" in roles_in_map
|
||||
assert "textbox" in roles_in_map
|
||||
assert "button" in roles_in_map
|
||||
|
||||
def test_skips_structural_roles(self):
|
||||
annotated, ref_map = annotate_snapshot(SAMPLE_SNAPSHOT)
|
||||
roles_in_map = {entry.role for entry in ref_map.values()}
|
||||
# navigation, main, list, listitem, paragraph are structural — no refs
|
||||
assert "navigation" not in roles_in_map
|
||||
assert "main" not in roles_in_map
|
||||
assert "list" not in roles_in_map
|
||||
assert "listitem" not in roles_in_map
|
||||
assert "paragraph" not in roles_in_map
|
||||
|
||||
def test_named_content_roles_get_refs(self):
|
||||
annotated, ref_map = annotate_snapshot(SAMPLE_SNAPSHOT)
|
||||
roles_in_map = {entry.role for entry in ref_map.values()}
|
||||
# heading and img have names, so they should get refs
|
||||
assert "heading" in roles_in_map
|
||||
assert "img" in roles_in_map
|
||||
|
||||
def test_unnamed_content_roles_skip(self):
|
||||
snapshot = "- heading\n- img"
|
||||
_, ref_map = annotate_snapshot(snapshot)
|
||||
# No names → no refs for content roles
|
||||
assert len(ref_map) == 0
|
||||
|
||||
def test_preserves_non_matching_lines(self):
|
||||
snapshot = 'some random text\n- button "OK"\nanother line'
|
||||
annotated, _ = annotate_snapshot(snapshot)
|
||||
lines = annotated.split("\n")
|
||||
assert lines[0] == "some random text"
|
||||
assert lines[2] == "another line"
|
||||
|
||||
def test_nth_disambiguation(self):
|
||||
snapshot = '- button "Save"\n- button "Save"\n- button "Cancel"'
|
||||
annotated, ref_map = annotate_snapshot(snapshot)
|
||||
|
||||
# Two "Save" buttons should have nth=0 and nth=1
|
||||
save_entries = [
|
||||
(rid, e) for rid, e in ref_map.items() if e.role == "button" and e.name == "Save"
|
||||
]
|
||||
assert len(save_entries) == 2
|
||||
nths = sorted(e.nth for _, e in save_entries)
|
||||
assert nths == [0, 1]
|
||||
|
||||
# "Cancel" should have nth=0
|
||||
cancel_entries = [e for e in ref_map.values() if e.role == "button" and e.name == "Cancel"]
|
||||
assert len(cancel_entries) == 1
|
||||
assert cancel_entries[0].nth == 0
|
||||
|
||||
def test_sequential_ref_ids(self):
    """Ref ids are assigned as e0, e1, e2, ... in document order."""
    _, ref_map = annotate_snapshot('- link "A"\n- link "B"\n- link "C"')
    assert sorted(ref_map) == ["e0", "e1", "e2"]
def test_empty_snapshot(self):
    """An empty snapshot yields an empty annotation and an empty map."""
    annotated, ref_map = annotate_snapshot("")
    assert (annotated, ref_map) == ("", {})
# ---------------------------------------------------------------------------
|
||||
# resolve_ref
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestResolveRef:
    """resolve_ref: eN lookup plus verbatim pass-through of real selectors."""

    def test_resolves_valid_ref(self):
        refs = {"e0": RefEntry(role="button", name="Submit", nth=0)}
        assert resolve_ref("e0", refs) == 'role=button[name="Submit"] >> nth=0'

    def test_passes_through_css_selectors(self):
        refs = {"e0": RefEntry(role="button", name="OK", nth=0)}
        # Anything that is not an eN token is returned unchanged.
        for selector in ("#my-button", ".btn-primary", "div > button"):
            assert resolve_ref(selector, refs) == selector

    def test_passes_through_role_selectors(self):
        refs = {"e0": RefEntry(role="button", name="OK", nth=0)}
        selector = 'role=button[name="OK"]'
        assert resolve_ref(selector, refs) == selector

    def test_raises_on_unknown_ref(self):
        refs = {"e0": RefEntry(role="button", name="OK", nth=0)}
        with pytest.raises(ValueError, match="not found"):
            resolve_ref("e99", refs)

    def test_raises_when_no_ref_map(self):
        # No snapshot taken yet -> nothing to resolve against.
        with pytest.raises(ValueError, match="no snapshot"):
            resolve_ref("e0", None)

    def test_escapes_quotes_in_name(self):
        refs = {"e0": RefEntry(role="button", name='Say "Hello"', nth=0)}
        assert resolve_ref("e0", refs) == 'role=button[name="Say \\"Hello\\""] >> nth=0'

    def test_no_name_produces_role_only_selector(self):
        refs = {"e0": RefEntry(role="textbox", name=None, nth=0)}
        assert resolve_ref("e0", refs) == "role=textbox >> nth=0"

    def test_empty_name(self):
        # An empty-but-present name still appears in the selector.
        refs = {"e0": RefEntry(role="button", name="", nth=0)}
        assert resolve_ref("e0", refs) == 'role=button[name=""] >> nth=0'

    def test_nth_in_selector(self):
        refs = {"e0": RefEntry(role="link", name="Next", nth=2)}
        assert resolve_ref("e0", refs) == 'role=link[name="Next"] >> nth=2'
|
||||
# ---------------------------------------------------------------------------
|
||||
# Round-trip: annotate → resolve
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRoundTrip:
    """annotate_snapshot output feeds resolve_ref without loss."""

    def test_annotate_then_resolve(self):
        _, ref_map = annotate_snapshot(
            '- button "Submit"\n- textbox "Email"\n- link "Home"'
        )

        # Every generated ref must resolve to a well-formed role selector
        # carrying the same role, name, and nth recorded in the map.
        for ref_id, entry in ref_map.items():
            selector = resolve_ref(ref_id, ref_map)
            assert selector.startswith(f"role={entry.role}")
            if entry.name is not None:
                assert f'name="{entry.name}"' in selector
            assert f"nth={entry.nth}" in selector

    def test_css_selectors_still_work_after_annotate(self):
        _, ref_map = annotate_snapshot('- button "OK"')
        # A populated ref_map must not hijack plain CSS selectors.
        assert resolve_ref("#submit-btn", ref_map) == "#submit-btn"
|
||||
@@ -0,0 +1,159 @@
|
||||
"""Tests for screenshot normalization.
|
||||
|
||||
Requires the ``browser`` extra (Pillow). Skipped automatically when
|
||||
Pillow is not installed.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
Image = pytest.importorskip(
|
||||
"PIL.Image", reason="Pillow not installed (install with: pip install pillow)"
|
||||
)
|
||||
|
||||
from gcu.browser.tools.inspection import _normalize_screenshot # noqa: E402
|
||||
|
||||
|
||||
def _make_png(width: int, height: int, *, mode: str = "RGB") -> bytes:
    """Return the PNG encoding of a solid-color image of the given size."""
    # RGBA gets a half-transparent alpha so conversion paths are exercised.
    fill = (100, 150, 200) if mode == "RGB" else (100, 150, 200, 128)
    image = Image.new(mode, (width, height), color=fill)
    out = io.BytesIO()
    image.save(out, format="PNG")
    return out.getvalue()
||||
def _make_large_png(width: int, height: int, min_bytes: int) -> bytes:
    """Return a *width* x *height* PNG intended to exceed *min_bytes*.

    A gradient-with-noise fill compresses poorly, which normally pushes
    the encoded size past the target. *min_bytes* is advisory only and is
    not enforced — if the encoding comes in under target that's acceptable;
    the tests mainly rely on the large pixel dimensions.
    """
    image = Image.new("RGB", (width, height))
    # Row-major pixel list (y outer, x inner) matches putdata's fill order.
    image.putdata(
        [
            ((x * 7 + y * 13) % 256, (x * 11 + y * 3) % 256, (x * 5 + y * 17) % 256)
            for y in range(height)
            for x in range(width)
        ]
    )
    out = io.BytesIO()
    image.save(out, format="PNG")
    return out.getvalue()
||||
|
||||
class TestPassthrough:
    """Images already within limits should pass through unchanged."""

    def test_small_image_unchanged(self):
        original = _make_png(100, 100)
        data, media_type = _normalize_screenshot(original, "png")
        # Identity, not just equality: no re-encode may happen at all.
        assert data is original
        assert media_type == "png"

    def test_within_dimension_and_size_unchanged(self):
        original = _make_png(1920, 1080)
        data, media_type = _normalize_screenshot(original, "png")
        assert data is original
        assert media_type == "png"
|
||||
class TestDimensionResize:
    """Images exceeding max_dimension should be resized."""

    def test_large_dimension_gets_resized(self):
        data, media_type = _normalize_screenshot(_make_png(4000, 3000), "png")

        # Normalization re-encodes as JPEG...
        assert media_type == "jpeg"
        # ...and clamps the longest edge to the default 2000px limit.
        resized = Image.open(io.BytesIO(data))
        assert max(resized.size) <= 2000

    def test_custom_max_dimension(self):
        data, media_type = _normalize_screenshot(
            _make_png(2000, 1500), "png", max_dimension=800
        )
        assert media_type == "jpeg"
        assert max(Image.open(io.BytesIO(data)).size) <= 800

    def test_aspect_ratio_preserved(self):
        data, _ = _normalize_screenshot(_make_png(4000, 2000), "png")  # 2:1 input

        width, height = Image.open(io.BytesIO(data)).size
        # Small tolerance for integer rounding of the scaled edges.
        assert abs(width / height - 2.0) < 0.1
|
||||
class TestSizeCompression:
    """Images exceeding max_bytes should be compressed."""

    def test_custom_max_bytes(self):
        oversized = _make_large_png(1500, 1500, min_bytes=100_000)
        data, media_type = _normalize_screenshot(oversized, "png", max_bytes=50_000)
        assert media_type == "jpeg"
        assert len(data) <= 50_000

    def test_over_size_within_dimension_compresses(self):
        """Image within dimension limit but over byte limit gets JPEG-compressed."""
        oversized = _make_large_png(1800, 1800, min_bytes=100_000)
        data, media_type = _normalize_screenshot(oversized, "png", max_bytes=50_000)
        assert media_type == "jpeg"
        assert len(data) <= 50_000
|
||||
class TestAlphaChannel:
    """RGBA images should be converted to RGB for JPEG output."""

    def test_rgba_to_rgb(self):
        data, media_type = _normalize_screenshot(
            _make_png(4000, 3000, mode="RGBA"), "png"
        )

        assert media_type == "jpeg"
        # JPEG has no alpha channel, so normalization must drop it.
        assert Image.open(io.BytesIO(data)).mode == "RGB"
|
||||
class TestGracefulDegradation:
    """Normalization should never break screenshots.

    Whatever goes wrong — Pillow missing, corrupt bytes, empty input —
    _normalize_screenshot must fall back to returning its input untouched.
    """

    def test_pillow_not_available(self):
        raw = _make_png(4000, 3000)
        # Evict any cached PIL modules AND make a fresh import of PIL fail,
        # so the function's lazy import raises ImportError.
        with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
            # FIX: the previous version reached through ``__builtins__``,
            # which is a CPython implementation detail (a module in
            # ``__main__`` but a plain dict in imported modules, where the
            # ``hasattr`` guard silently failed). The ``builtins`` module is
            # the documented, reliable way to get the real __import__.
            import builtins

            real_import = builtins.__import__

            def failing_import(name, *args, **kwargs):
                if name == "PIL" or name.startswith("PIL."):
                    raise ImportError("No module named 'PIL'")
                return real_import(name, *args, **kwargs)

            with patch("builtins.__import__", side_effect=failing_import):
                data, media_type = _normalize_screenshot(raw, "png")

        # Should return original unchanged
        assert data is raw
        assert media_type == "png"

    def test_corrupt_bytes_returns_original(self):
        raw = b"not an image at all"
        data, media_type = _normalize_screenshot(raw, "png")

        assert data is raw
        assert media_type == "png"

    def test_empty_bytes_returns_original(self):
        raw = b""
        data, media_type = _normalize_screenshot(raw, "png")

        assert data is raw
        assert media_type == "png"
||||
@@ -3523,6 +3523,9 @@ all = [
|
||||
bigquery = [
|
||||
{ name = "google-cloud-bigquery" },
|
||||
]
|
||||
browser = [
|
||||
{ name = "pillow" },
|
||||
]
|
||||
databricks = [
|
||||
{ name = "databricks-mcp" },
|
||||
{ name = "databricks-sdk" },
|
||||
@@ -3577,6 +3580,7 @@ requires-dist = [
|
||||
{ name = "openpyxl", marker = "extra == 'excel'", specifier = ">=3.1.0" },
|
||||
{ name = "pandas", specifier = ">=2.0.0" },
|
||||
{ name = "pillow", marker = "extra == 'all'", specifier = ">=10.0.0" },
|
||||
{ name = "pillow", marker = "extra == 'browser'", specifier = ">=10.0.0" },
|
||||
{ name = "pillow", marker = "extra == 'ocr'", specifier = ">=10.0.0" },
|
||||
{ name = "playwright", specifier = ">=1.40.0" },
|
||||
{ name = "playwright-stealth", specifier = ">=1.0.5" },
|
||||
@@ -3594,7 +3598,7 @@ requires-dist = [
|
||||
{ name = "restrictedpython", marker = "extra == 'sandbox'", specifier = ">=7.0" },
|
||||
{ name = "stripe", specifier = ">=14.3.0" },
|
||||
]
|
||||
provides-extras = ["dev", "sandbox", "browser", "ocr", "excel", "sql", "bigquery", "databricks", "all"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
|
||||
Reference in New Issue
Block a user