fix(tests): unblock main CI (#7141)
Two unrelated test failures were keeping main red:

- test_capabilities.py: fixtures referenced deprecated model identifiers no longer in model_catalog.json. After the catalog refactor, unknown models default to vision-capable, so 12 "expect False" assertions flipped to True. Replace the fixtures with current catalog entries that carry an explicit supports_vision flag.
- test_colony_runtime_overseer.py: a 200 ms hard sleep racing the background worker was flaky on Windows CI. Poll for llm.stream_calls with a 5 s deadline instead.
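For context, a minimal sketch of the lookup semantics the updated fixtures pin down. The helper name and the default-True behavior for unknown models come straight from the tests below; the catalog schema, the prefix stripping, and the local-runtime fallback are assumptions, not the actual implementation:

# Sketch only: illustrates the catalog-driven check described above.
# Assumes model_catalog.json maps lowercase model ids to entries with a
# "supports_vision" boolean; the real schema may differ.
import json
from functools import lru_cache

@lru_cache(maxsize=1)
def _catalog() -> dict:
    with open("model_catalog.json") as f:
        return json.load(f)

def supports_image_tool_results(model: str) -> bool:
    key = model.lower()
    # Try the full id, then progressively stripped provider prefixes, so
    # "deepseek/deepseek-reasoner" resolves to the "deepseek-reasoner" entry.
    for candidate in (key, key.split("/", 1)[-1], key.rsplit("/", 1)[-1]):
        entry = _catalog().get(candidate)
        if entry is not None:
            return bool(entry.get("supports_vision", True))
    # Hypothetical fallback: local runtimes are treated as text-only, and
    # any other unknown model defaults to vision-capable.
    if key.split("/", 1)[0] in {"ollama", "ollama_chat", "lm_studio", "vllm", "llamacpp"}:
        return False
    return True

The upshot of the default is that new hosted frontier models work without a catalog edit; only models with an explicit supports_vision=false entry (or a known local runtime) lose image-producing tools.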
test_capabilities.py

@@ -9,26 +9,25 @@ from framework.llm.provider import Tool
 class TestSupportsImageToolResults:
-    """Verify the deny-list correctly identifies models that can't handle images."""
+    """Verify catalog-driven vision capability checks."""
 
     @pytest.mark.parametrize(
         "model",
         [
-            "gpt-4o",
-            "gpt-4o-mini",
-            "gpt-4-turbo",
-            "openai/gpt-4o",
-            "anthropic/claude-sonnet-4-20250514",
-            "gemini/gemini-1.5-pro",
-            "google/gemini-1.5-flash",
-            "mistral/mistral-large",
-            "groq/llama3-70b",
-            "together/meta-llama/Llama-3-70b",
-            "fireworks_ai/llama-v3-70b",
-            "azure/gpt-4o",
-            "kimi/claude-sonnet-4-20250514",
-            "hive/claude-sonnet-4-20250514",
+            # Catalog entries with supports_vision=true
+            "claude-haiku-4-5-20251001",
+            "claude-sonnet-4-5-20250929",
+            "claude-opus-4-6",
+            "gpt-5.4",
+            "gpt-5.4-mini",
+            "gemini-3-flash-preview",
+            "kimi-k2.5",
+            # Provider-prefixed catalog entries
+            "openrouter/openai/gpt-5.4",
+            "openrouter/anthropic/claude-sonnet-4.6",
+            # Unknown models default to True (hosted frontier assumption)
+            "some-future-model",
+            "azure/gpt-5",
         ],
     )
     def test_supported_models(self, model: str):
@@ -37,27 +36,24 @@ class TestSupportsImageToolResults:
     @pytest.mark.parametrize(
         "model",
         [
-            "deepseek/deepseek-chat",
-            "deepseek/deepseek-coder",
-            "deepseek-chat",
-            "ollama/llama3",
-            "ollama/mistral",
-            "ollama_chat/llama3",
-            "lm_studio/my-model",
-            "vllm/meta-llama/Llama-3-70b",
-            "llamacpp/model",
-            "cerebras/llama3-70b",
+            # Catalog entries with supports_vision=false
+            "deepseek-reasoner",
+            "deepseek-v4-pro",
+            "deepseek-v4-flash",
+            "glm-5.1",
+            "queen",
+            "MiniMax-M2.7",
+            "codestral-2508",
+            "llama-3.3-70b-versatile",
+            # Provider-prefixed forms resolve to the same catalog entry
+            "deepseek/deepseek-reasoner",
+            "hive/glm-5.1",
+            "groq/llama-3.3-70b-versatile",
         ],
     )
     def test_unsupported_models(self, model: str):
         assert supports_image_tool_results(model) is False
 
     def test_case_insensitive(self):
         assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
         assert supports_image_tool_results("OLLAMA/llama3") is False
         assert supports_image_tool_results("GPT-4o") is True
 
 
 class TestFilterToolsForModel:
     """Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""
@@ -68,7 +64,7 @@ class TestFilterToolsForModel:
             Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
             Tool(name="browser_snapshot", description="get page content"),
         ]
-        filtered, hidden = filter_tools_for_model(tools, "glm-5")
+        filtered, hidden = filter_tools_for_model(tools, "glm-5.1")
         names = [t.name for t in filtered]
         assert "browser_screenshot" not in names
         assert "read_file" in names
@@ -80,7 +76,7 @@ class TestFilterToolsForModel:
             Tool(name="read_file", description="read a file"),
             Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
         ]
-        filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-20250514")
+        filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-5-20250929")
         assert {t.name for t in filtered} == {"read_file", "browser_screenshot"}
         assert hidden == []
 
@@ -90,8 +86,8 @@ class TestFilterToolsForModel:
             Tool(name="read_file", description="read a file"),
             Tool(name="web_search", description="search the web"),
         ]
-        text_only, text_hidden = filter_tools_for_model(tools, "glm-5")
-        vision, vision_hidden = filter_tools_for_model(tools, "gpt-4o")
+        text_only, text_hidden = filter_tools_for_model(tools, "glm-5.1")
+        vision, vision_hidden = filter_tools_for_model(tools, "claude-sonnet-4-5-20250929")
         assert len(text_only) == 2 and text_hidden == []
         assert len(vision) == 2 and vision_hidden == []
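The filter tests above pin down a small contract. A sketch of that contract follows, with the signature and the (filtered, hidden) return shape inferred from the call sites; the body is illustrative, not the project's implementation:

# Sketch of the behavior TestFilterToolsForModel asserts. Tool comes from
# framework.llm.provider (see the hunk header above); produces_image is
# assumed to default to False.
from framework.llm.provider import Tool

def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[Tool]]:
    # Vision-capable models see every tool; nothing is hidden.
    if supports_image_tool_results(model):
        return list(tools), []
    # Text-only models lose image-producing tools, which are reported
    # back to the caller as "hidden".
    visible = [t for t in tools if not t.produces_image]
    hidden = [t for t in tools if t.produces_image]
    return visible, hidden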
test_colony_runtime_overseer.py

@@ -486,8 +486,12 @@ class TestReportToParentGatingByStream:
         try:
             # Spawn a parallel worker — its tool list should include report_to_parent
             await colony.spawn(task="test", count=1)
-            # After the worker's first LLM call, check the recorded tools
-            await asyncio.sleep(0.2)  # let the background task run
+            # Poll until the worker fires its first LLM call. Bare sleeps were
+            # flaky on slow Windows CI; loop with a generous deadline instead.
+            for _ in range(100):
+                if llm.stream_calls:
+                    break
+                await asyncio.sleep(0.05)
+            assert llm.stream_calls, "Worker never called the LLM"
             worker_tools = llm.stream_calls[0]["tools"]
             tool_names = [t.name for t in (worker_tools or [])]
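The inlined loop above is 100 iterations of a 50 ms sleep, i.e. the 5 s deadline from the commit message. The same pattern as a reusable helper, sketched here under a hypothetical name (wait_for_condition is not part of this codebase):

import asyncio
import time
from typing import Callable

async def wait_for_condition(
    predicate: Callable[[], object],
    timeout: float = 5.0,
    interval: float = 0.05,
) -> bool:
    # Poll until predicate() is truthy or the deadline passes; unlike a
    # bare sleep, a fast machine exits on the first successful check.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return True
        await asyncio.sleep(interval)
    return bool(predicate())  # one final check at the deadline

With it, the assertion in the test would read: assert await wait_for_condition(lambda: llm.stream_calls), "Worker never called the LLM".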