feat: strip image content for non-vision models
This commit is contained in:
@@ -24,6 +24,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
|
||||
|
||||
from framework.graph.conversation import ConversationStore, NodeConversation
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -2703,11 +2704,21 @@ class EventLoopNode(NodeProtocol):
|
||||
real_tool_results.append(tool_entry)
|
||||
logged_tool_calls.append(tool_entry)
|
||||
|
||||
# Strip image content for models that can't handle it
|
||||
image_content = result.image_content
|
||||
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
|
||||
logger.info(
|
||||
"Stripping image_content from tool result — model '%s' "
|
||||
"does not support images in tool results",
|
||||
ctx.llm.model,
|
||||
)
|
||||
image_content = None
|
||||
|
||||
await conversation.add_tool_result(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=result.content,
|
||||
is_error=result.is_error,
|
||||
image_content=result.image_content,
|
||||
image_content=image_content,
|
||||
)
|
||||
if tc.tool_name in ("ask_user", "ask_user_multiple"):
|
||||
# Defer tool_call_completed until after user responds
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
"""Model capability checks for LLM providers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Prefixes of models/providers known to NOT support image content blocks
|
||||
# inside tool result messages. We use a deny-list (rather than an allow-list)
|
||||
# because most OpenAI-compatible providers pass content lists through to the
|
||||
# API unchanged — only a few are known to silently strip or break on images.
|
||||
_IMAGE_TOOL_RESULT_DENY_PREFIXES: tuple[str, ...] = (
|
||||
# DeepSeek: LiteLLM explicitly flattens all content lists to strings,
|
||||
# silently dropping image blocks.
|
||||
"deepseek/",
|
||||
"deepseek-",
|
||||
# Local model providers: most models lack vision support, and those that
|
||||
# do typically handle images in user messages only, not tool results.
|
||||
"ollama/",
|
||||
"ollama_chat/",
|
||||
"lm_studio/",
|
||||
"vllm/",
|
||||
"llamacpp/",
|
||||
# Cerebras: no known vision/multimodal support.
|
||||
"cerebras/",
|
||||
)
|
||||
|
||||
|
||||
def supports_image_tool_results(model: str) -> bool:
|
||||
"""Return whether *model* can receive image content in tool result messages.
|
||||
|
||||
Models on the deny-list are known to either silently strip images or lack
|
||||
vision support entirely. Everything else is assumed to work (OpenAI,
|
||||
Anthropic, Gemini, Mistral, Groq, etc. all handle it correctly via LiteLLM).
|
||||
"""
|
||||
model_lower = model.lower()
|
||||
return not any(model_lower.startswith(prefix) for prefix in _IMAGE_TOOL_RESULT_DENY_PREFIXES)
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Tests for LLM model capability checks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
|
||||
|
||||
class TestSupportsImageToolResults:
    """Verify the deny-list correctly identifies models that can't handle images."""

    # Vision-capable stacks: everything off the deny-list is assumed to
    # forward image tool results correctly via LiteLLM.
    _IMAGE_CAPABLE_MODELS = [
        "gpt-4o",
        "gpt-4o-mini",
        "gpt-4-turbo",
        "openai/gpt-4o",
        "anthropic/claude-sonnet-4-20250514",
        "claude-haiku-4-5-20251001",
        "gemini/gemini-1.5-pro",
        "google/gemini-1.5-flash",
        "mistral/mistral-large",
        "groq/llama3-70b",
        "together/meta-llama/Llama-3-70b",
        "fireworks_ai/llama-v3-70b",
        "azure/gpt-4o",
        "kimi/claude-sonnet-4-20250514",
        "hive/claude-sonnet-4-20250514",
    ]

    # Deny-listed providers: known to strip images from tool results or to
    # lack vision support entirely.
    _IMAGE_DENIED_MODELS = [
        "deepseek/deepseek-chat",
        "deepseek/deepseek-coder",
        "deepseek-chat",
        "deepseek-reasoner",
        "ollama/llama3",
        "ollama/mistral",
        "ollama_chat/llama3",
        "lm_studio/my-model",
        "vllm/meta-llama/Llama-3-70b",
        "llamacpp/model",
        "cerebras/llama3-70b",
    ]

    @pytest.mark.parametrize("model", _IMAGE_CAPABLE_MODELS)
    def test_supported_models(self, model: str):
        """Models absent from the deny-list are reported as image-capable."""
        assert supports_image_tool_results(model) is True

    @pytest.mark.parametrize("model", _IMAGE_DENIED_MODELS)
    def test_unsupported_models(self, model: str):
        """Deny-listed providers are reported as unable to take images."""
        assert supports_image_tool_results(model) is False

    def test_case_insensitive(self):
        """Prefix matching must ignore the case of the model identifier."""
        assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
        assert supports_image_tool_results("OLLAMA/llama3") is False
        assert supports_image_tool_results("GPT-4o") is True
|
||||
Reference in New Issue
Block a user