Merge remote-tracking branch 'origin/feat/text-only-tool-filter' into feature/new-colony

This commit is contained in:
Richard Tang
2026-04-14 16:29:19 -07:00
10 changed files with 298 additions and 37 deletions
+9 -2
View File
@@ -87,7 +87,7 @@ from framework.agent_loop.internals.types import (
)
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
from framework.host.event_bus import EventBus
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool, ToolResult, ToolUse
from framework.llm.stream_events import (
FinishEvent,
@@ -562,13 +562,20 @@ class AgentLoop(AgentProtocol):
if isinstance(stream_id, str) and stream_id.startswith("worker:"):
tools.append(build_report_to_parent_tool())
# Hide image-producing tools from text-only models so they never try
# to call them. Avoids wasted turns + "screenshot failed" lessons
# getting saved to memory. See framework.llm.capabilities.
_llm_model = ctx.llm.model if ctx.llm else ""
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
logger.info(
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s",
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
node_id,
len(tools),
[t.name for t in tools],
ctx.supports_direct_user_io,
type(self._judge).__name__ if self._judge else "None",
_hidden_image_tools,
)
# 4. Publish loop started
+26 -2
View File
@@ -1,5 +1,6 @@
"""Node definitions for Queen agent."""
import re
from pathlib import Path
from framework.orchestrator import NodeSpec
@@ -32,6 +33,29 @@ def _build_appendices() -> str:
return parts
# Wraps prompt sections that should only be shown to vision-capable models.
# Content inside `<!-- vision-only -->...<!-- /vision-only -->` is kept for
# vision models and stripped for text-only models. Applied once per session
# in queen_orchestrator.create_queen.
_VISION_ONLY_BLOCK_RE = re.compile(
    r"<!-- vision-only -->(.*?)<!-- /vision-only -->",
    re.DOTALL,
)


def finalize_queen_prompt(text: str, has_vision: bool) -> str:
    """Resolve ``<!-- vision-only -->`` blocks based on model capability.

    Vision-capable models keep the inner content with the markers
    stripped; text-only models lose the whole block (markers and
    content) so the queen is never nudged toward tools it cannot
    usefully invoke.
    """
    # Group 1 is the inner content; the non-greedy match plus DOTALL lets
    # each block span newlines without swallowing text between blocks.
    replacement = r"\1" if has_vision else ""
    return _VISION_ONLY_BLOCK_RE.sub(replacement, text)
# Shared appendices — appended to every coding node's system prompt.
_appendices = _build_appendices()
@@ -502,7 +526,7 @@ The queen writes final production-ready system prompts directly.
MCP servers are loaded from the global registry by name. Available servers:
- `hive_tools` web search, email, CRM, calendar, 100+ integrations
- `gcu-tools` browser automation (click, type, navigate, screenshot)
- `gcu-tools` browser automation (click, type, navigate<!-- vision-only -->, screenshot<!-- /vision-only -->)
- `files-tools` file I/O (read, write, edit, search, list)
**Template variables:** Add a `variables:` section at the top of agent.json \
@@ -807,7 +831,7 @@ search_files, run_command, undo_changes
## Browser Automation (gcu-tools MCP)
All browser tools are prefixed with `browser_` (browser_start, browser_navigate, \
browser_click, browser_fill, browser_snapshot, browser_screenshot, browser_scroll, \
browser_click, browser_fill, browser_snapshot, <!-- vision-only -->browser_screenshot, <!-- /vision-only -->browser_scroll, \
browser_tabs, browser_close, browser_evaluate, etc.).
Follow the browser-automation skill protocol — activate it before using browser tools.
@@ -20,7 +20,9 @@ All tools are prefixed with `browser_`:
- `browser_start`, `browser_open` -- launch/navigate
- `browser_click`, `browser_fill`, `browser_type` -- interact
- `browser_snapshot` -- read page content (preferred over screenshot)
<!-- vision-only -->
- `browser_screenshot` -- visual capture
<!-- /vision-only -->
- `browser_scroll`, `browser_wait` -- navigation helpers
- `browser_evaluate` -- run JavaScript
+24
View File
@@ -12,6 +12,11 @@ Vision support rules are derived from official vendor documentation:
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from framework.llm.provider import Tool
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
@@ -104,3 +109,22 @@ def supports_image_tool_results(model: str) -> bool:
# 5. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
    """Drop image-producing tools for text-only models.

    Returns ``(filtered_tools, hidden_names)``. Vision-capable models —
    and the empty-*model* fallback — get a copy of the input list with
    nothing hidden. Text-only models get every ``produces_image=True``
    tool removed so the LLM never sees it in its schema, which avoids
    wasted calls and stale "screenshot failed" entries in agent memory.
    """
    # Empty model means "capability unknown" — fail open and hide nothing.
    if not model or supports_image_tool_results(model):
        return list(tools), []
    kept: list[Tool] = []
    hidden: list[str] = []
    # Single-pass partition: image producers go to `hidden`, the rest stay.
    for tool in tools:
        if tool.produces_image:
            hidden.append(tool.name)
        else:
            kept.append(tool)
    return kept, hidden
+3
View File
@@ -27,6 +27,9 @@ class Tool:
name: str
description: str
parameters: dict[str, Any] = field(default_factory=dict)
# If True, the tool may return ImageContent in its result. Text-only models
# (e.g. glm-5, deepseek-chat) have this hidden from their schema entirely.
produces_image: bool = False
@dataclass
+12
View File
@@ -7,6 +7,7 @@ import inspect
import json
import logging
import os
import re
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
@@ -18,6 +19,16 @@ logger = logging.getLogger(__name__)
_INPUT_LOG_MAX_LEN = 500
# Tools whose names match this pattern are assumed to return ImageContent.
# Matched against the bare tool name (case-insensitive). Used to mark MCP
# tools with produces_image=True so they can be filtered out for text-only
# models before the schema is ever shown to the LLM (avoids wasted calls
# and "screenshot failed" entries polluting memory).
_IMAGE_TOOL_NAME_RE = re.compile(
r"(screenshot|screen_capture|capture_image|render_image|get_image|snapshot_image)",
re.IGNORECASE,
)
# Per-execution context overrides. Each asyncio task (and thus each
# concurrent graph execution) gets its own copy, so there are no races
# when multiple ExecutionStreams run in parallel.
@@ -925,6 +936,7 @@ class ToolRegistry:
"properties": properties,
"required": required,
},
produces_image=bool(_IMAGE_TOOL_NAME_RE.search(mcp_tool.name or "")),
)
return tool
+53 -32
View File
@@ -311,7 +311,9 @@ async def create_queen(
_queen_tools_running,
_queen_tools_staging,
_shared_building_knowledge,
finalize_queen_prompt,
)
from framework.llm.capabilities import supports_image_tool_results
from framework.host.event_bus import AgentEvent, EventType
from framework.loader.mcp_registry import MCPRegistry
from framework.loader.tool_registry import ToolRegistry
@@ -489,6 +491,13 @@ async def create_queen(
"according to your current phase."
)
# Resolve vision-only prompt sections based on the session's LLM.
# session.llm is immutable for the session's lifetime, so this check
# is stable — prompts never need to be recomposed mid-session.
_has_vision = bool(
session.llm and supports_image_tool_results(getattr(session.llm, "model", ""))
)
_planning_body = (
_queen_character_core
+ _queen_role_planning
@@ -500,7 +509,7 @@ async def create_queen(
+ _planning_knowledge
+ worker_identity
)
phase_state.prompt_planning = _planning_body
phase_state.prompt_planning = finalize_queen_prompt(_planning_body, _has_vision)
_building_body = (
_queen_character_core
@@ -515,40 +524,52 @@ async def create_queen(
+ _appendices
+ worker_identity
)
phase_state.prompt_building = _building_body
phase_state.prompt_staging = (
_queen_character_core
+ _queen_role_staging
+ _queen_style
+ _queen_tools_staging
+ _queen_behavior_always
+ _queen_behavior_staging
+ worker_identity
phase_state.prompt_building = finalize_queen_prompt(_building_body, _has_vision)
phase_state.prompt_staging = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_staging
+ _queen_style
+ _queen_tools_staging
+ _queen_behavior_always
+ _queen_behavior_staging
+ worker_identity
),
_has_vision,
)
phase_state.prompt_running = (
_queen_character_core
+ _queen_role_running
+ _queen_style
+ _queen_tools_running
+ _queen_behavior_always
+ _queen_behavior_running
+ worker_identity
phase_state.prompt_running = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_running
+ _queen_style
+ _queen_tools_running
+ _queen_behavior_always
+ _queen_behavior_running
+ worker_identity
),
_has_vision,
)
phase_state.prompt_editing = (
_queen_identity_editing
+ _queen_style
+ _queen_tools_editing
+ _queen_behavior_always
+ _queen_behavior_editing
+ worker_identity
phase_state.prompt_editing = finalize_queen_prompt(
(
_queen_identity_editing
+ _queen_style
+ _queen_tools_editing
+ _queen_behavior_always
+ _queen_behavior_editing
+ worker_identity
),
_has_vision,
)
phase_state.prompt_independent = (
_queen_character_core
+ _queen_role_independent
+ _queen_style
+ _queen_tools_independent
+ _queen_behavior_always
+ _queen_behavior_independent
phase_state.prompt_independent = finalize_queen_prompt(
(
_queen_character_core
+ _queen_role_independent
+ _queen_style
+ _queen_tools_independent
+ _queen_behavior_always
+ _queen_behavior_independent
),
_has_vision,
)
# ---- Default skill protocols -------------------------------------
+55 -1
View File
@@ -4,7 +4,8 @@ from __future__ import annotations
import pytest
from framework.llm.capabilities import supports_image_tool_results
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool
class TestSupportsImageToolResults:
@@ -56,3 +57,56 @@ class TestSupportsImageToolResults:
assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
assert supports_image_tool_results("OLLAMA/llama3") is False
assert supports_image_tool_results("GPT-4o") is True
class TestFilterToolsForModel:
    """Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""

    def test_hides_image_tool_from_text_only_model(self):
        tools = [
            Tool(name="read_file", description="read a file"),
            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
            Tool(name="browser_snapshot", description="get page content"),
        ]
        kept, hidden = filter_tools_for_model(tools, "glm-5")
        assert {t.name for t in kept} == {"read_file", "browser_snapshot"}
        assert hidden == ["browser_screenshot"]

    def test_keeps_image_tool_for_vision_model(self):
        tools = [
            Tool(name="read_file", description="read a file"),
            Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
        ]
        kept, hidden = filter_tools_for_model(tools, "claude-sonnet-4-20250514")
        assert hidden == []
        assert {t.name for t in kept} == {"read_file", "browser_screenshot"}

    def test_default_tools_are_not_filtered(self):
        """Tools without produces_image (default False) survive every model."""
        tools = [
            Tool(name="read_file", description="read a file"),
            Tool(name="web_search", description="search the web"),
        ]
        # Same outcome for a text-only and a vision model.
        for model in ("glm-5", "gpt-4o"):
            kept, hidden = filter_tools_for_model(tools, model)
            assert len(kept) == 2
            assert hidden == []

    def test_empty_model_string_returns_tools_unchanged(self):
        """Guards the ctx.llm-missing path where model is empty."""
        tools = [Tool(name="browser_screenshot", description="", produces_image=True)]
        kept, hidden = filter_tools_for_model(tools, "")
        assert hidden == []
        assert len(kept) == 1

    def test_returned_list_is_a_copy(self):
        """Mutating the filtered list must not touch the caller's input."""
        tools = [Tool(name="read_file", description="")]
        kept, _ = filter_tools_for_model(tools, "gpt-4o")
        kept.append(Tool(name="extra", description=""))
        assert len(tools) == 1
+65
View File
@@ -0,0 +1,65 @@
"""Tests for vision-only prompt block stripping in Queen nodes.
Covers ``finalize_queen_prompt`` — the function that resolves
``<!-- vision-only -->...<!-- /vision-only -->`` markers in Queen phase
prompts before they reach the LLM. Vision-capable models see the inner
content; text-only models see the block removed entirely.
"""
from __future__ import annotations
from framework.agents.queen.nodes import finalize_queen_prompt
class TestFinalizeQueenPrompt:
    """Behavioral checks for the vision-only marker resolver."""

    def test_vision_model_keeps_inner_content_and_strips_markers(self):
        text = "before <!-- vision-only -->secret<!-- /vision-only --> after"
        assert finalize_queen_prompt(text, has_vision=True) == "before secret after"

    def test_text_only_model_removes_entire_block(self):
        text = "before <!-- vision-only -->secret<!-- /vision-only --> after"
        result = finalize_queen_prompt(text, has_vision=False)
        assert result == "before after"
        assert "secret" not in result
        assert "vision-only" not in result

    def test_multiline_block_handled(self):
        """Regex must use DOTALL so blocks can span newlines."""
        text = (
            "- item 1\n"
            "<!-- vision-only -->\n"
            "- item 2 (vision only)\n"
            "<!-- /vision-only -->\n"
            "- item 3\n"
        )
        with_vision = finalize_queen_prompt(text, has_vision=True)
        without_vision = finalize_queen_prompt(text, has_vision=False)
        assert "- item 2 (vision only)" in with_vision
        assert "- item 2 (vision only)" not in without_vision
        assert "- item 1" in without_vision
        assert "- item 3" in without_vision

    def test_multiple_blocks_in_same_text(self):
        text = (
            "A <!-- vision-only -->X<!-- /vision-only --> "
            "B <!-- vision-only -->Y<!-- /vision-only --> C"
        )
        assert finalize_queen_prompt(text, has_vision=True) == "A X B Y C"
        assert finalize_queen_prompt(text, has_vision=False) == "A B C"

    def test_non_greedy_match_does_not_swallow_between_blocks(self):
        """A greedy regex would match from the first opening marker to the
        last closing marker and wipe out the middle section. Lock the
        non-greedy behavior down so a future refactor can't regress it."""
        text = (
            "<!-- vision-only -->first<!-- /vision-only -->"
            "KEEP"
            "<!-- vision-only -->second<!-- /vision-only -->"
        )
        assert finalize_queen_prompt(text, has_vision=False) == "KEEP"
        assert finalize_queen_prompt(text, has_vision=True) == "firstKEEPsecond"

    def test_text_without_markers_is_unchanged(self):
        text = "plain prompt with no markers at all"
        # Marker-free text must pass through untouched either way.
        for has_vision in (True, False):
            assert finalize_queen_prompt(text, has_vision=has_vision) == text
+49
View File
@@ -797,3 +797,52 @@ def test_resync_returns_false_when_credentials_unchanged(tmp_path, monkeypatch):
monkeypatch.setattr(registry, "_snapshot_credentials", lambda: set())
assert registry.resync_mcp_servers_if_needed() is False
class TestMcpToolProducesImageFlag:
    """Verify _convert_mcp_tool_to_framework_tool sets produces_image from the name.

    This is the detection step the filter in AgentLoop depends on — if
    the regex regresses, text-only models will start seeing screenshot
    tools they can't use.
    """

    @staticmethod
    def _mcp_tool(name: str):
        return SimpleNamespace(
            name=name,
            description=f"{name} description",
            input_schema={"type": "object", "properties": {}, "required": []},
            server_name="test",
        )

    def _convert(self, name: str):
        # Shared shortcut: build a fake MCP tool and run the real converter.
        registry = ToolRegistry()
        return registry._convert_mcp_tool_to_framework_tool(self._mcp_tool(name))  # noqa: SLF001

    def test_screenshot_flagged(self):
        assert self._convert("browser_screenshot").produces_image is True

    def test_snapshot_not_flagged(self):
        """browser_snapshot returns a DOM tree, not an image — must not match."""
        assert self._convert("browser_snapshot").produces_image is False

    def test_case_insensitive_match(self):
        assert self._convert("TakeScreenshot").produces_image is True

    def test_plain_tool_not_flagged(self):
        assert self._convert("read_file").produces_image is False

    def test_image_suffix_variants_flagged(self):
        for name in ("capture_image", "render_image", "get_image", "snapshot_image"):
            assert self._convert(name).produces_image is True, f"{name} should be flagged"