Merge remote-tracking branch 'origin/feat/text-only-tool-filter' into feature/new-colony
This commit is contained in:
@@ -87,7 +87,7 @@ from framework.agent_loop.internals.types import (
|
||||
)
|
||||
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
|
||||
from framework.host.event_bus import EventBus
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -562,13 +562,20 @@ class AgentLoop(AgentProtocol):
|
||||
if isinstance(stream_id, str) and stream_id.startswith("worker:"):
|
||||
tools.append(build_report_to_parent_tool())
|
||||
|
||||
# Hide image-producing tools from text-only models so they never try
|
||||
# to call them. Avoids wasted turns + "screenshot failed" lessons
|
||||
# getting saved to memory. See framework.llm.capabilities.
|
||||
_llm_model = ctx.llm.model if ctx.llm else ""
|
||||
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
|
||||
|
||||
logger.info(
|
||||
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s",
|
||||
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
|
||||
node_id,
|
||||
len(tools),
|
||||
[t.name for t in tools],
|
||||
ctx.supports_direct_user_io,
|
||||
type(self._judge).__name__ if self._judge else "None",
|
||||
_hidden_image_tools,
|
||||
)
|
||||
|
||||
# 4. Publish loop started
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Node definitions for Queen agent."""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from framework.orchestrator import NodeSpec
|
||||
@@ -32,6 +33,29 @@ def _build_appendices() -> str:
|
||||
return parts
|
||||
|
||||
|
||||
# Wraps prompt sections that should only be shown to vision-capable models.
|
||||
# Content inside `<!-- vision-only -->...<!-- /vision-only -->` is kept for
|
||||
# vision models and stripped for text-only models. Applied once per session
|
||||
# in queen_orchestrator.create_queen.
|
||||
_VISION_ONLY_BLOCK_RE = re.compile(
|
||||
r"<!-- vision-only -->(.*?)<!-- /vision-only -->",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
def finalize_queen_prompt(text: str, has_vision: bool) -> str:
|
||||
"""Resolve `<!-- vision-only -->` blocks based on model capability.
|
||||
|
||||
For vision-capable models the markers are stripped and the inner
|
||||
content is kept. For text-only models the whole block (markers +
|
||||
content) is removed so the queen is never nudged toward tools it
|
||||
cannot usefully invoke.
|
||||
"""
|
||||
if has_vision:
|
||||
return _VISION_ONLY_BLOCK_RE.sub(r"\1", text)
|
||||
return _VISION_ONLY_BLOCK_RE.sub("", text)
|
||||
|
||||
|
||||
# Shared appendices — appended to every coding node's system prompt.
|
||||
_appendices = _build_appendices()
|
||||
|
||||
@@ -502,7 +526,7 @@ The queen writes final production-ready system prompts directly.
|
||||
|
||||
MCP servers are loaded from the global registry by name. Available servers:
|
||||
- `hive_tools` — web search, email, CRM, calendar, 100+ integrations
|
||||
- `gcu-tools` — browser automation (click, type, navigate, screenshot)
|
||||
- `gcu-tools` — browser automation (click, type, navigate<!-- vision-only -->, screenshot<!-- /vision-only -->)
|
||||
- `files-tools` — file I/O (read, write, edit, search, list)
|
||||
|
||||
**Template variables:** Add a `variables:` section at the top of agent.json \
|
||||
@@ -807,7 +831,7 @@ search_files, run_command, undo_changes
|
||||
|
||||
## Browser Automation (gcu-tools MCP)
|
||||
All browser tools are prefixed with `browser_` (browser_start, browser_navigate, \
|
||||
browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \
|
||||
browser_click, browser_fill, browser_snapshot, <!-- vision-only -->browser_screenshot, <!-- /vision-only -->browser_scroll, \
|
||||
browser_tabs, browser_close, browser_evaluate, etc.).
|
||||
Follow the browser-automation skill protocol — activate it before using browser tools.
|
||||
|
||||
|
||||
@@ -20,7 +20,9 @@ All tools are prefixed with `browser_`:
|
||||
- `browser_start`, `browser_open` -- launch/navigate
|
||||
- `browser_click`, `browser_fill`, `browser_type` -- interact
|
||||
- `browser_snapshot` -- read page content (preferred over screenshot)
|
||||
<!-- vision-only -->
|
||||
- `browser_screenshot` -- visual capture
|
||||
<!-- /vision-only -->
|
||||
- `browser_scroll`, `browser_wait` -- navigation helpers
|
||||
- `browser_evaluate` -- run JavaScript
|
||||
|
||||
|
||||
@@ -12,6 +12,11 @@ Vision support rules are derived from official vendor documentation:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.llm.provider import Tool
|
||||
|
||||
|
||||
def _model_name(model: str) -> str:
|
||||
"""Return the bare model name after stripping any 'provider/' prefix."""
|
||||
@@ -104,3 +109,22 @@ def supports_image_tool_results(model: str) -> bool:
|
||||
# 5. Default: assume vision capable
|
||||
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
|
||||
return True
|
||||
|
||||
|
||||
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
|
||||
"""Drop image-producing tools for text-only models.
|
||||
|
||||
Returns ``(filtered_tools, hidden_names)``. For vision-capable models
|
||||
(or when *model* is empty) the input list is returned unchanged and
|
||||
``hidden_names`` is empty. For text-only models any tool with
|
||||
``produces_image=True`` is removed so the LLM never sees it in its
|
||||
schema — avoids wasted calls and stale "screenshot failed" entries
|
||||
in agent memory.
|
||||
"""
|
||||
if not model or supports_image_tool_results(model):
|
||||
return list(tools), []
|
||||
hidden = [t.name for t in tools if t.produces_image]
|
||||
if not hidden:
|
||||
return list(tools), []
|
||||
kept = [t for t in tools if not t.produces_image]
|
||||
return kept, hidden
|
||||
|
||||
@@ -27,6 +27,9 @@ class Tool:
|
||||
name: str
|
||||
description: str
|
||||
parameters: dict[str, Any] = field(default_factory=dict)
|
||||
# If True, the tool may return ImageContent in its result. Text-only models
|
||||
# (e.g. glm-5, deepseek-chat) have this hidden from their schema entirely.
|
||||
produces_image: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -7,6 +7,7 @@ import inspect
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -18,6 +19,16 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
_INPUT_LOG_MAX_LEN = 500
|
||||
|
||||
# Tools whose names match this pattern are assumed to return ImageContent.
|
||||
# Matched against the bare tool name (case-insensitive). Used to mark MCP
|
||||
# tools with produces_image=True so they can be filtered out for text-only
|
||||
# models before the schema is ever shown to the LLM (avoids wasted calls
|
||||
# and "screenshot failed" entries polluting memory).
|
||||
_IMAGE_TOOL_NAME_RE = re.compile(
|
||||
r"(screenshot|screen_capture|capture_image|render_image|get_image|snapshot_image)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Per-execution context overrides. Each asyncio task (and thus each
|
||||
# concurrent graph execution) gets its own copy, so there are no races
|
||||
# when multiple ExecutionStreams run in parallel.
|
||||
@@ -925,6 +936,7 @@ class ToolRegistry:
|
||||
"properties": properties,
|
||||
"required": required,
|
||||
},
|
||||
produces_image=bool(_IMAGE_TOOL_NAME_RE.search(mcp_tool.name or "")),
|
||||
)
|
||||
|
||||
return tool
|
||||
|
||||
@@ -311,7 +311,9 @@ async def create_queen(
|
||||
_queen_tools_running,
|
||||
_queen_tools_staging,
|
||||
_shared_building_knowledge,
|
||||
finalize_queen_prompt,
|
||||
)
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.host.event_bus import AgentEvent, EventType
|
||||
from framework.loader.mcp_registry import MCPRegistry
|
||||
from framework.loader.tool_registry import ToolRegistry
|
||||
@@ -489,6 +491,13 @@ async def create_queen(
|
||||
"according to your current phase."
|
||||
)
|
||||
|
||||
# Resolve vision-only prompt sections based on the session's LLM.
|
||||
# session.llm is immutable for the session's lifetime, so this check
|
||||
# is stable — prompts never need to be recomposed mid-session.
|
||||
_has_vision = bool(
|
||||
session.llm and supports_image_tool_results(getattr(session.llm, "model", ""))
|
||||
)
|
||||
|
||||
_planning_body = (
|
||||
_queen_character_core
|
||||
+ _queen_role_planning
|
||||
@@ -500,7 +509,7 @@ async def create_queen(
|
||||
+ _planning_knowledge
|
||||
+ worker_identity
|
||||
)
|
||||
phase_state.prompt_planning = _planning_body
|
||||
phase_state.prompt_planning = finalize_queen_prompt(_planning_body, _has_vision)
|
||||
|
||||
_building_body = (
|
||||
_queen_character_core
|
||||
@@ -515,40 +524,52 @@ async def create_queen(
|
||||
+ _appendices
|
||||
+ worker_identity
|
||||
)
|
||||
phase_state.prompt_building = _building_body
|
||||
phase_state.prompt_staging = (
|
||||
_queen_character_core
|
||||
+ _queen_role_staging
|
||||
+ _queen_style
|
||||
+ _queen_tools_staging
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_staging
|
||||
+ worker_identity
|
||||
phase_state.prompt_building = finalize_queen_prompt(_building_body, _has_vision)
|
||||
phase_state.prompt_staging = finalize_queen_prompt(
|
||||
(
|
||||
_queen_character_core
|
||||
+ _queen_role_staging
|
||||
+ _queen_style
|
||||
+ _queen_tools_staging
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_staging
|
||||
+ worker_identity
|
||||
),
|
||||
_has_vision,
|
||||
)
|
||||
phase_state.prompt_running = (
|
||||
_queen_character_core
|
||||
+ _queen_role_running
|
||||
+ _queen_style
|
||||
+ _queen_tools_running
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_running
|
||||
+ worker_identity
|
||||
phase_state.prompt_running = finalize_queen_prompt(
|
||||
(
|
||||
_queen_character_core
|
||||
+ _queen_role_running
|
||||
+ _queen_style
|
||||
+ _queen_tools_running
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_running
|
||||
+ worker_identity
|
||||
),
|
||||
_has_vision,
|
||||
)
|
||||
phase_state.prompt_editing = (
|
||||
_queen_identity_editing
|
||||
+ _queen_style
|
||||
+ _queen_tools_editing
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_editing
|
||||
+ worker_identity
|
||||
phase_state.prompt_editing = finalize_queen_prompt(
|
||||
(
|
||||
_queen_identity_editing
|
||||
+ _queen_style
|
||||
+ _queen_tools_editing
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_editing
|
||||
+ worker_identity
|
||||
),
|
||||
_has_vision,
|
||||
)
|
||||
phase_state.prompt_independent = (
|
||||
_queen_character_core
|
||||
+ _queen_role_independent
|
||||
+ _queen_style
|
||||
+ _queen_tools_independent
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_independent
|
||||
phase_state.prompt_independent = finalize_queen_prompt(
|
||||
(
|
||||
_queen_character_core
|
||||
+ _queen_role_independent
|
||||
+ _queen_style
|
||||
+ _queen_tools_independent
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_independent
|
||||
),
|
||||
_has_vision,
|
||||
)
|
||||
|
||||
# ---- Default skill protocols -------------------------------------
|
||||
|
||||
@@ -4,7 +4,8 @@ from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
|
||||
from framework.llm.provider import Tool
|
||||
|
||||
|
||||
class TestSupportsImageToolResults:
|
||||
@@ -56,3 +57,56 @@ class TestSupportsImageToolResults:
|
||||
assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
|
||||
assert supports_image_tool_results("OLLAMA/llama3") is False
|
||||
assert supports_image_tool_results("GPT-4o") is True
|
||||
|
||||
|
||||
class TestFilterToolsForModel:
|
||||
"""Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""
|
||||
|
||||
def test_hides_image_tool_from_text_only_model(self):
|
||||
tools = [
|
||||
Tool(name="read_file", description="read a file"),
|
||||
Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
|
||||
Tool(name="browser_snapshot", description="get page content"),
|
||||
]
|
||||
filtered, hidden = filter_tools_for_model(tools, "glm-5")
|
||||
names = [t.name for t in filtered]
|
||||
assert "browser_screenshot" not in names
|
||||
assert "read_file" in names
|
||||
assert "browser_snapshot" in names
|
||||
assert hidden == ["browser_screenshot"]
|
||||
|
||||
def test_keeps_image_tool_for_vision_model(self):
|
||||
tools = [
|
||||
Tool(name="read_file", description="read a file"),
|
||||
Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
|
||||
]
|
||||
filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-20250514")
|
||||
assert {t.name for t in filtered} == {"read_file", "browser_screenshot"}
|
||||
assert hidden == []
|
||||
|
||||
def test_default_tools_are_not_filtered(self):
|
||||
"""Tools without produces_image (default False) are kept for all models."""
|
||||
tools = [
|
||||
Tool(name="read_file", description="read a file"),
|
||||
Tool(name="web_search", description="search the web"),
|
||||
]
|
||||
text_only, text_hidden = filter_tools_for_model(tools, "glm-5")
|
||||
vision, vision_hidden = filter_tools_for_model(tools, "gpt-4o")
|
||||
assert len(text_only) == 2 and text_hidden == []
|
||||
assert len(vision) == 2 and vision_hidden == []
|
||||
|
||||
def test_empty_model_string_returns_tools_unchanged(self):
|
||||
"""Guards the ctx.llm-missing path where model is empty."""
|
||||
tools = [
|
||||
Tool(name="browser_screenshot", description="", produces_image=True),
|
||||
]
|
||||
filtered, hidden = filter_tools_for_model(tools, "")
|
||||
assert len(filtered) == 1
|
||||
assert hidden == []
|
||||
|
||||
def test_returned_list_is_a_copy(self):
|
||||
"""Caller should be free to mutate the filtered list without affecting input."""
|
||||
tools = [Tool(name="read_file", description="")]
|
||||
filtered, _ = filter_tools_for_model(tools, "gpt-4o")
|
||||
filtered.append(Tool(name="extra", description=""))
|
||||
assert len(tools) == 1
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
"""Tests for vision-only prompt block stripping in Queen nodes.
|
||||
|
||||
Covers ``finalize_queen_prompt`` — the function that resolves
|
||||
``<!-- vision-only -->...<!-- /vision-only -->`` markers in Queen phase
|
||||
prompts before they reach the LLM. Vision-capable models see the inner
|
||||
content; text-only models see the block removed entirely.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from framework.agents.queen.nodes import finalize_queen_prompt
|
||||
|
||||
|
||||
class TestFinalizeQueenPrompt:
|
||||
def test_vision_model_keeps_inner_content_and_strips_markers(self):
|
||||
text = "before <!-- vision-only -->secret<!-- /vision-only --> after"
|
||||
result = finalize_queen_prompt(text, has_vision=True)
|
||||
assert result == "before secret after"
|
||||
|
||||
def test_text_only_model_removes_entire_block(self):
|
||||
text = "before <!-- vision-only -->secret<!-- /vision-only --> after"
|
||||
result = finalize_queen_prompt(text, has_vision=False)
|
||||
assert result == "before after"
|
||||
assert "secret" not in result
|
||||
assert "vision-only" not in result
|
||||
|
||||
def test_multiline_block_handled(self):
|
||||
"""Regex must use DOTALL so blocks can span newlines."""
|
||||
text = (
|
||||
"- item 1\n"
|
||||
"<!-- vision-only -->\n"
|
||||
"- item 2 (vision only)\n"
|
||||
"<!-- /vision-only -->\n"
|
||||
"- item 3\n"
|
||||
)
|
||||
vision = finalize_queen_prompt(text, has_vision=True)
|
||||
text_only = finalize_queen_prompt(text, has_vision=False)
|
||||
assert "- item 2 (vision only)" in vision
|
||||
assert "- item 2 (vision only)" not in text_only
|
||||
assert "- item 1" in text_only and "- item 3" in text_only
|
||||
|
||||
def test_multiple_blocks_in_same_text(self):
|
||||
text = (
|
||||
"A <!-- vision-only -->X<!-- /vision-only --> "
|
||||
"B <!-- vision-only -->Y<!-- /vision-only --> C"
|
||||
)
|
||||
assert finalize_queen_prompt(text, has_vision=True) == "A X B Y C"
|
||||
assert finalize_queen_prompt(text, has_vision=False) == "A B C"
|
||||
|
||||
def test_non_greedy_match_does_not_swallow_between_blocks(self):
|
||||
"""A naïve greedy regex would match from the first opening marker
|
||||
to the last closing marker and wipe out the middle section. Lock
|
||||
that down so a future refactor can't regress to greedy."""
|
||||
text = (
|
||||
"<!-- vision-only -->first<!-- /vision-only -->"
|
||||
"KEEP"
|
||||
"<!-- vision-only -->second<!-- /vision-only -->"
|
||||
)
|
||||
assert finalize_queen_prompt(text, has_vision=False) == "KEEP"
|
||||
assert finalize_queen_prompt(text, has_vision=True) == "firstKEEPsecond"
|
||||
|
||||
def test_text_without_markers_is_unchanged(self):
|
||||
text = "plain prompt with no markers at all"
|
||||
assert finalize_queen_prompt(text, has_vision=True) == text
|
||||
assert finalize_queen_prompt(text, has_vision=False) == text
|
||||
@@ -797,3 +797,52 @@ def test_resync_returns_false_when_credentials_unchanged(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(registry, "_snapshot_credentials", lambda: set())
|
||||
|
||||
assert registry.resync_mcp_servers_if_needed() is False
|
||||
|
||||
|
||||
class TestMcpToolProducesImageFlag:
|
||||
"""Verify _convert_mcp_tool_to_framework_tool sets produces_image from the name.
|
||||
|
||||
This is the detection step that the filter in AgentLoop depends on —
|
||||
if the regex regresses, text-only models will start seeing screenshot
|
||||
tools they can't use.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _mcp_tool(name: str):
|
||||
return SimpleNamespace(
|
||||
name=name,
|
||||
description=f"{name} description",
|
||||
input_schema={"type": "object", "properties": {}, "required": []},
|
||||
server_name="test",
|
||||
)
|
||||
|
||||
def test_screenshot_flagged(self):
|
||||
registry = ToolRegistry()
|
||||
mcp = self._mcp_tool("browser_screenshot")
|
||||
tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001
|
||||
assert tool.produces_image is True
|
||||
|
||||
def test_snapshot_not_flagged(self):
|
||||
"""browser_snapshot returns a DOM tree, not an image — must not match."""
|
||||
registry = ToolRegistry()
|
||||
mcp = self._mcp_tool("browser_snapshot")
|
||||
tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001
|
||||
assert tool.produces_image is False
|
||||
|
||||
def test_case_insensitive_match(self):
|
||||
registry = ToolRegistry()
|
||||
mcp = self._mcp_tool("TakeScreenshot")
|
||||
tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001
|
||||
assert tool.produces_image is True
|
||||
|
||||
def test_plain_tool_not_flagged(self):
|
||||
registry = ToolRegistry()
|
||||
mcp = self._mcp_tool("read_file")
|
||||
tool = registry._convert_mcp_tool_to_framework_tool(mcp) # noqa: SLF001
|
||||
assert tool.produces_image is False
|
||||
|
||||
def test_image_suffix_variants_flagged(self):
|
||||
registry = ToolRegistry()
|
||||
for name in ("capture_image", "render_image", "get_image", "snapshot_image"):
|
||||
tool = registry._convert_mcp_tool_to_framework_tool(self._mcp_tool(name)) # noqa: SLF001
|
||||
assert tool.produces_image is True, f"{name} should be flagged"
|
||||
|
||||
Reference in New Issue
Block a user