fix(tests): unblock main CI (#7141)

Two unrelated test failures were keeping main red:

- test_capabilities.py: fixtures referenced deprecated model identifiers
  no longer in model_catalog.json. After the catalog refactor unknown
  models default to vision-capable, so 12 "expect False" assertions
  flipped to True. Replace fixtures with current catalog entries that
  carry an explicit supports_vision flag.

- test_colony_runtime_overseer.py: a 200ms hard sleep racing the
  background worker was flaky on Windows CI. Poll for llm.stream_calls
  with a 5s deadline instead.
This commit is contained in:
Hundao
2026-04-26 21:34:21 +08:00
committed by GitHub
parent ea707438f2
commit de8d6f0946
2 changed files with 36 additions and 36 deletions
+30 -34
View File
@@ -9,26 +9,25 @@ from framework.llm.provider import Tool
class TestSupportsImageToolResults:
"""Verify the deny-list correctly identifies models that can't handle images."""
"""Verify catalog-driven vision capability checks."""
@pytest.mark.parametrize(
"model",
[
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"openai/gpt-4o",
"anthropic/claude-sonnet-4-20250514",
# Catalog entries with supports_vision=true
"claude-haiku-4-5-20251001",
"gemini/gemini-1.5-pro",
"google/gemini-1.5-flash",
"mistral/mistral-large",
"groq/llama3-70b",
"together/meta-llama/Llama-3-70b",
"fireworks_ai/llama-v3-70b",
"azure/gpt-4o",
"kimi/claude-sonnet-4-20250514",
"hive/claude-sonnet-4-20250514",
"claude-sonnet-4-5-20250929",
"claude-opus-4-6",
"gpt-5.4",
"gpt-5.4-mini",
"gemini-3-flash-preview",
"kimi-k2.5",
# Provider-prefixed catalog entries
"openrouter/openai/gpt-5.4",
"openrouter/anthropic/claude-sonnet-4.6",
# Unknown models default to True (hosted frontier assumption)
"some-future-model",
"azure/gpt-5",
],
)
def test_supported_models(self, model: str):
@@ -37,27 +36,24 @@ class TestSupportsImageToolResults:
@pytest.mark.parametrize(
"model",
[
"deepseek/deepseek-chat",
"deepseek/deepseek-coder",
"deepseek-chat",
# Catalog entries with supports_vision=false
"deepseek-reasoner",
"ollama/llama3",
"ollama/mistral",
"ollama_chat/llama3",
"lm_studio/my-model",
"vllm/meta-llama/Llama-3-70b",
"llamacpp/model",
"cerebras/llama3-70b",
"deepseek-v4-pro",
"deepseek-v4-flash",
"glm-5.1",
"queen",
"MiniMax-M2.7",
"codestral-2508",
"llama-3.3-70b-versatile",
# Provider-prefixed forms resolve to the same catalog entry
"deepseek/deepseek-reasoner",
"hive/glm-5.1",
"groq/llama-3.3-70b-versatile",
],
)
def test_unsupported_models(self, model: str):
assert supports_image_tool_results(model) is False
def test_case_insensitive(self):
assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
assert supports_image_tool_results("OLLAMA/llama3") is False
assert supports_image_tool_results("GPT-4o") is True
class TestFilterToolsForModel:
"""Verify ``filter_tools_for_model`` — the real helper used by AgentLoop."""
@@ -68,7 +64,7 @@ class TestFilterToolsForModel:
Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
Tool(name="browser_snapshot", description="get page content"),
]
filtered, hidden = filter_tools_for_model(tools, "glm-5")
filtered, hidden = filter_tools_for_model(tools, "glm-5.1")
names = [t.name for t in filtered]
assert "browser_screenshot" not in names
assert "read_file" in names
@@ -80,7 +76,7 @@ class TestFilterToolsForModel:
Tool(name="read_file", description="read a file"),
Tool(name="browser_screenshot", description="take a screenshot", produces_image=True),
]
filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-20250514")
filtered, hidden = filter_tools_for_model(tools, "claude-sonnet-4-5-20250929")
assert {t.name for t in filtered} == {"read_file", "browser_screenshot"}
assert hidden == []
@@ -90,8 +86,8 @@ class TestFilterToolsForModel:
Tool(name="read_file", description="read a file"),
Tool(name="web_search", description="search the web"),
]
text_only, text_hidden = filter_tools_for_model(tools, "glm-5")
vision, vision_hidden = filter_tools_for_model(tools, "gpt-4o")
text_only, text_hidden = filter_tools_for_model(tools, "glm-5.1")
vision, vision_hidden = filter_tools_for_model(tools, "claude-sonnet-4-5-20250929")
assert len(text_only) == 2 and text_hidden == []
assert len(vision) == 2 and vision_hidden == []
+6 -2
View File
@@ -486,8 +486,12 @@ class TestReportToParentGatingByStream:
try:
# Spawn a parallel worker — its tool list should include report_to_parent
await colony.spawn(task="test", count=1)
# After the worker's first LLM call, check the recorded tools
await asyncio.sleep(0.2) # let the background task run
# Poll until the worker fires its first LLM call. Bare sleeps were
# flaky on slow Windows CI; loop with a generous deadline instead.
for _ in range(100):
if llm.stream_calls:
break
await asyncio.sleep(0.05)
assert llm.stream_calls, "Worker never called the LLM"
worker_tools = llm.stream_calls[0]["tools"]
tool_names = [t.name for t in (worker_tools or [])]