diff --git a/core/framework/agent_loop/agent_loop.py b/core/framework/agent_loop/agent_loop.py index 6832bc86..4de48e2f 100644 --- a/core/framework/agent_loop/agent_loop.py +++ b/core/framework/agent_loop/agent_loop.py @@ -87,7 +87,7 @@ from framework.agent_loop.internals.types import ( ) from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult from framework.host.event_bus import EventBus -from framework.llm.capabilities import supports_image_tool_results +from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results from framework.llm.provider import Tool, ToolResult, ToolUse from framework.llm.stream_events import ( FinishEvent, @@ -562,13 +562,20 @@ class AgentLoop(AgentProtocol): if isinstance(stream_id, str) and stream_id.startswith("worker:"): tools.append(build_report_to_parent_tool()) + # Hide image-producing tools from text-only models so they never try + # to call them. Avoids wasted turns + "screenshot failed" lessons + # getting saved to memory. See framework.llm.capabilities. + _llm_model = ctx.llm.model if ctx.llm else "" + tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model) + logger.info( - "[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s", + "[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s", node_id, len(tools), [t.name for t in tools], ctx.supports_direct_user_io, type(self._judge).__name__ if self._judge else "None", + _hidden_image_tools, ) # 4. 
Publish loop started diff --git a/core/framework/agents/queen/nodes/__init__.py b/core/framework/agents/queen/nodes/__init__.py index a7ce284f..759cbccb 100644 --- a/core/framework/agents/queen/nodes/__init__.py +++ b/core/framework/agents/queen/nodes/__init__.py @@ -1,5 +1,6 @@ """Node definitions for Queen agent.""" +import re from pathlib import Path from framework.orchestrator import NodeSpec @@ -32,6 +33,29 @@ def _build_appendices() -> str: return parts +# Wraps prompt sections that should only be shown to vision-capable models. +# Content inside `<vision-only>...</vision-only>` is kept for +# vision models and stripped for text-only models. Applied once per session +# in queen_orchestrator.create_queen. +_VISION_ONLY_BLOCK_RE = re.compile( + r"<vision-only>(.*?)</vision-only>", + re.DOTALL, +) + + +def finalize_queen_prompt(text: str, has_vision: bool) -> str: + """Resolve `<vision-only>` blocks based on model capability. + + For vision-capable models the markers are stripped and the inner + content is kept. For text-only models the whole block (markers + + content) is removed so the queen is never nudged toward tools it + cannot usefully invoke. + """ + if has_vision: + return _VISION_ONLY_BLOCK_RE.sub(r"\1", text) + return _VISION_ONLY_BLOCK_RE.sub("", text) + + # Shared appendices — appended to every coding node's system prompt. _appendices = _build_appendices() @@ -502,7 +526,7 @@ The queen writes final production-ready system prompts directly. MCP servers are loaded from the global registry by name. 
Available servers: - `hive_tools` — web search, email, CRM, calendar, 100+ integrations -- `gcu-tools` — browser automation (click, type, navigate, screenshot) +- `gcu-tools` — browser automation (click, type, navigate, screenshot) - `files-tools` — file I/O (read, write, edit, search, list) **Template variables:** Add a `variables:` section at the top of agent.json \ @@ -807,7 +831,7 @@ search_files, run_command, undo_changes ## Browser Automation (gcu-tools MCP) All browser tools are prefixed with `browser_` (browser_start, browser_navigate, \ -browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \ +browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \ browser_tabs, browser_close, browser_evaluate, etc.). Follow the browser-automation skill protocol — activate it before using browser tools. diff --git a/core/framework/agents/queen/reference/gcu_guide.md b/core/framework/agents/queen/reference/gcu_guide.md index cf254637..fbb1cb70 100644 --- a/core/framework/agents/queen/reference/gcu_guide.md +++ b/core/framework/agents/queen/reference/gcu_guide.md @@ -20,7 +20,9 @@ All tools are prefixed with `browser_`: - `browser_start`, `browser_open` -- launch/navigate - `browser_click`, `browser_fill`, `browser_type` -- interact - `browser_snapshot` -- read page content (preferred over screenshot) + - `browser_screenshot` -- visual capture + - `browser_scroll`, `browser_wait` -- navigation helpers - `browser_evaluate` -- run JavaScript diff --git a/core/framework/llm/capabilities.py b/core/framework/llm/capabilities.py index 4a666d8d..8a0aaa83 100644 --- a/core/framework/llm/capabilities.py +++ b/core/framework/llm/capabilities.py @@ -12,6 +12,11 @@ Vision support rules are derived from official vendor documentation: from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from framework.llm.provider import Tool + def _model_name(model: str) -> str: """Return the bare model name 
after stripping any 'provider/' prefix.""" @@ -104,3 +109,22 @@ def supports_image_tool_results(model: str) -> bool: # 5. Default: assume vision capable # Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers return True + + +def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]: + """Drop image-producing tools for text-only models. + + Returns ``(filtered_tools, hidden_names)``. For vision-capable models + (or when *model* is empty) the input list is returned unchanged and + ``hidden_names`` is empty. For text-only models any tool with + ``produces_image=True`` is removed so the LLM never sees it in its + schema — avoids wasted calls and stale "screenshot failed" entries + in agent memory. + """ + if not model or supports_image_tool_results(model): + return list(tools), [] + hidden = [t.name for t in tools if t.produces_image] + if not hidden: + return list(tools), [] + kept = [t for t in tools if not t.produces_image] + return kept, hidden diff --git a/core/framework/llm/provider.py b/core/framework/llm/provider.py index fcdaa61d..6300bb3f 100644 --- a/core/framework/llm/provider.py +++ b/core/framework/llm/provider.py @@ -27,6 +27,9 @@ class Tool: name: str description: str parameters: dict[str, Any] = field(default_factory=dict) + # If True, the tool may return ImageContent in its result. Text-only models + # (e.g. glm-5, deepseek-chat) have this hidden from their schema entirely. 
+ produces_image: bool = False @dataclass diff --git a/core/framework/loader/tool_registry.py b/core/framework/loader/tool_registry.py index 198c7537..13002ad3 100644 --- a/core/framework/loader/tool_registry.py +++ b/core/framework/loader/tool_registry.py @@ -7,6 +7,7 @@ import inspect import json import logging import os +import re from collections.abc import Callable from dataclasses import dataclass from pathlib import Path @@ -18,6 +19,16 @@ logger = logging.getLogger(__name__) _INPUT_LOG_MAX_LEN = 500 +# Tools whose names match this pattern are assumed to return ImageContent. +# Matched against the bare tool name (case-insensitive). Used to mark MCP +# tools with produces_image=True so they can be filtered out for text-only +# models before the schema is ever shown to the LLM (avoids wasted calls +# and "screenshot failed" entries polluting memory). +_IMAGE_TOOL_NAME_RE = re.compile( + r"(screenshot|screen_capture|capture_image|render_image|get_image|snapshot_image)", + re.IGNORECASE, +) + # Per-execution context overrides. Each asyncio task (and thus each # concurrent graph execution) gets its own copy, so there are no races # when multiple ExecutionStreams run in parallel. 
@@ -925,6 +936,7 @@ class ToolRegistry: "properties": properties, "required": required, }, + produces_image=bool(_IMAGE_TOOL_NAME_RE.search(mcp_tool.name or "")), ) return tool diff --git a/core/framework/server/queen_orchestrator.py b/core/framework/server/queen_orchestrator.py index 04f7b82d..ce905068 100644 --- a/core/framework/server/queen_orchestrator.py +++ b/core/framework/server/queen_orchestrator.py @@ -311,7 +311,9 @@ async def create_queen( _queen_tools_running, _queen_tools_staging, _shared_building_knowledge, + finalize_queen_prompt, ) + from framework.llm.capabilities import supports_image_tool_results from framework.host.event_bus import AgentEvent, EventType from framework.loader.mcp_registry import MCPRegistry from framework.loader.tool_registry import ToolRegistry @@ -489,6 +491,13 @@ async def create_queen( "according to your current phase." ) + # Resolve vision-only prompt sections based on the session's LLM. + # session.llm is immutable for the session's lifetime, so this check + # is stable — prompts never need to be recomposed mid-session. 
+ _has_vision = bool( + session.llm and supports_image_tool_results(getattr(session.llm, "model", "")) + ) + _planning_body = ( _queen_character_core + _queen_role_planning @@ -500,7 +509,7 @@ async def create_queen( + _planning_knowledge + worker_identity ) - phase_state.prompt_planning = _planning_body + phase_state.prompt_planning = finalize_queen_prompt(_planning_body, _has_vision) _building_body = ( _queen_character_core @@ -515,40 +524,52 @@ async def create_queen( + _appendices + worker_identity ) - phase_state.prompt_building = _building_body - phase_state.prompt_staging = ( - _queen_character_core - + _queen_role_staging - + _queen_style - + _queen_tools_staging - + _queen_behavior_always - + _queen_behavior_staging - + worker_identity + phase_state.prompt_building = finalize_queen_prompt(_building_body, _has_vision) + phase_state.prompt_staging = finalize_queen_prompt( + ( + _queen_character_core + + _queen_role_staging + + _queen_style + + _queen_tools_staging + + _queen_behavior_always + + _queen_behavior_staging + + worker_identity + ), + _has_vision, ) - phase_state.prompt_running = ( - _queen_character_core - + _queen_role_running - + _queen_style - + _queen_tools_running - + _queen_behavior_always - + _queen_behavior_running - + worker_identity + phase_state.prompt_running = finalize_queen_prompt( + ( + _queen_character_core + + _queen_role_running + + _queen_style + + _queen_tools_running + + _queen_behavior_always + + _queen_behavior_running + + worker_identity + ), + _has_vision, ) - phase_state.prompt_editing = ( - _queen_identity_editing - + _queen_style - + _queen_tools_editing - + _queen_behavior_always - + _queen_behavior_editing - + worker_identity + phase_state.prompt_editing = finalize_queen_prompt( + ( + _queen_identity_editing + + _queen_style + + _queen_tools_editing + + _queen_behavior_always + + _queen_behavior_editing + + worker_identity + ), + _has_vision, ) - phase_state.prompt_independent = ( - _queen_character_core - + 
_queen_role_independent - + _queen_style - + _queen_tools_independent - + _queen_behavior_always - + _queen_behavior_independent + phase_state.prompt_independent = finalize_queen_prompt( + ( + _queen_character_core + + _queen_role_independent + + _queen_style + + _queen_tools_independent + + _queen_behavior_always + + _queen_behavior_independent + ), + _has_vision, ) # ---- Default skill protocols ------------------------------------- diff --git a/core/tests/test_capabilities.py b/core/tests/test_capabilities.py index a6b84e3d..ba391957 100644 --- a/core/tests/test_capabilities.py +++ b/core/tests/test_capabilities.py @@ -4,7 +4,8 @@ from __future__ import annotations import pytest -from framework.llm.capabilities import supports_image_tool_results +from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results +from framework.llm.provider import Tool class TestSupportsImageToolResults: @@ -56,3 +57,56 @@ class TestSupportsImageToolResults: assert supports_image_tool_results("DeepSeek/deepseek-chat") is False assert supports_image_tool_results("OLLAMA/llama3") is False assert supports_image_tool_results("GPT-4o") is True + + +class TestFilterToolsForModel: + """Verify ``filter_tools_for_model`` — the real helper used by AgentLoop.""" + + def test_hides_image_tool_from_text_only_model(self): + tools = [ + Tool(name="read_file", description="read a file"), + Tool(name="browser_screenshot", description="take a screenshot", produces_image=True), + Tool(name="browser_snapshot", description="get page content"), + ] + filtered, hidden = filter_tools_for_model(tools, "glm-5") + names = [t.name for t in filtered] + assert "browser_screenshot" not in names + assert "read_file" in names + assert "browser_snapshot" in names + assert hidden == ["browser_screenshot"] + + def test_keeps_image_tool_for_vision_model(self): + tools = [ + Tool(name="read_file", description="read a file"), + Tool(name="browser_screenshot", description="take a screenshot", 
produces_image=True), ] filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-20250514") assert {t.name for t in filtered} == {"read_file", "browser_screenshot"} assert hidden == [] + + def test_default_tools_are_not_filtered(self): + """Tools without produces_image (default False) are kept for all models.""" + tools = [ + Tool(name="read_file", description="read a file"), + Tool(name="web_search", description="search the web"), + ] + text_only, text_hidden = filter_tools_for_model(tools, "glm-5") + vision, vision_hidden = filter_tools_for_model(tools, "gpt-4o") + assert len(text_only) == 2 and text_hidden == [] + assert len(vision) == 2 and vision_hidden == [] + + def test_empty_model_string_returns_tools_unchanged(self): + """Guards the ctx.llm-missing path where model is empty.""" + tools = [ + Tool(name="browser_screenshot", description="", produces_image=True), + ] + filtered, hidden = filter_tools_for_model(tools, "") + assert len(filtered) == 1 + assert hidden == [] + + def test_returned_list_is_a_copy(self): + """Caller should be free to mutate the filtered list without affecting input.""" + tools = [Tool(name="read_file", description="")] + filtered, _ = filter_tools_for_model(tools, "gpt-4o") + filtered.append(Tool(name="extra", description="")) + assert len(tools) == 1 diff --git a/core/tests/test_queen_nodes_prompt.py b/core/tests/test_queen_nodes_prompt.py new file mode 100644 index 00000000..e1660434 --- /dev/null +++ b/core/tests/test_queen_nodes_prompt.py @@ -0,0 +1,65 @@ +"""Tests for vision-only prompt block stripping in Queen nodes. + +Covers ``finalize_queen_prompt`` — the function that resolves +``<vision-only>...</vision-only>`` markers in Queen phase +prompts before they reach the LLM. Vision-capable models see the inner +content; text-only models see the block removed entirely. 
+""" + +from __future__ import annotations + +from framework.agents.queen.nodes import finalize_queen_prompt + + +class TestFinalizeQueenPrompt: + def test_vision_model_keeps_inner_content_and_strips_markers(self): + text = "before secret after" + result = finalize_queen_prompt(text, has_vision=True) + assert result == "before secret after" + + def test_text_only_model_removes_entire_block(self): + text = "before secret after" + result = finalize_queen_prompt(text, has_vision=False) + assert result == "before after" + assert "secret" not in result + assert "vision-only" not in result + + def test_multiline_block_handled(self): + """Regex must use DOTALL so blocks can span newlines.""" + text = ( + "- item 1\n" + "\n" + "- item 2 (vision only)\n" + "\n" + "- item 3\n" + ) + vision = finalize_queen_prompt(text, has_vision=True) + text_only = finalize_queen_prompt(text, has_vision=False) + assert "- item 2 (vision only)" in vision + assert "- item 2 (vision only)" not in text_only + assert "- item 1" in text_only and "- item 3" in text_only + + def test_multiple_blocks_in_same_text(self): + text = ( + "A X " + "B Y C" + ) + assert finalize_queen_prompt(text, has_vision=True) == "A X B Y C" + assert finalize_queen_prompt(text, has_vision=False) == "A B C" + + def test_non_greedy_match_does_not_swallow_between_blocks(self): + """A naïve greedy regex would match from the first opening marker + to the last closing marker and wipe out the middle section. 
Lock + that down so a future refactor can't regress to greedy.""" + text = ( + "<vision-only>first</vision-only>" + "KEEP" + "<vision-only>second</vision-only>" + ) + assert finalize_queen_prompt(text, has_vision=False) == "KEEP" + assert finalize_queen_prompt(text, has_vision=True) == "firstKEEPsecond" + + def test_text_without_markers_is_unchanged(self): + text = "plain prompt with no markers at all" + assert finalize_queen_prompt(text, has_vision=True) == text + assert finalize_queen_prompt(text, has_vision=False) == text diff --git a/core/tests/test_tool_registry.py b/core/tests/test_tool_registry.py index 8e5bcdd3..4d2212f9 100644 --- a/core/tests/test_tool_registry.py +++ b/core/tests/test_tool_registry.py @@ -797,3 +797,52 @@ def test_resync_returns_false_when_credentials_unchanged(tmp_path, monkeypatch): monkeypatch.setattr(registry, "_snapshot_credentials", lambda: set()) assert registry.resync_mcp_servers_if_needed() is False + + +class TestMcpToolProducesImageFlag: + """Verify _convert_mcp_tool_to_framework_tool sets produces_image from the name. + + This is the detection step that the filter in AgentLoop depends on — + if the regex regresses, text-only models will start seeing screenshot + tools they can't use. 
+ """ + + @staticmethod + def _mcp_tool(name: str): + return SimpleNamespace( + name=name, + description=f"{name} description", + input_schema={"type": "object", "properties": {}, "required": []}, + server_name="test", + ) + + def test_screenshot_flagged(self): + registry = ToolRegistry() + mcp = self._mcp_tool("browser_screenshot") + tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001 + assert tool.produces_image is True + + def test_snapshot_not_flagged(self): + """browser_snapshot returns a DOM tree, not an image — must not match.""" + registry = ToolRegistry() + mcp = self._mcp_tool("browser_snapshot") + tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001 + assert tool.produces_image is False + + def test_case_insensitive_match(self): + registry = ToolRegistry() + mcp = self._mcp_tool("TakeScreenshot") + tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001 + assert tool.produces_image is True + + def test_plain_tool_not_flagged(self): + registry = ToolRegistry() + mcp = self._mcp_tool("read_file") + tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001 + assert tool.produces_image is False + + def test_image_suffix_variants_flagged(self): + registry = ToolRegistry() + for name in ("capture_image", "render_image", "get_image", "snapshot_image"): + tool = registry._convert_mcp_tool_to_framework_tool(self._mcp_tool(name)) # noqa: SLF001 + assert tool.produces_image is True, f"{name} should be flagged"