feat: strip image content for non-vision models
This commit is contained in:
@@ -24,6 +24,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
|
||||
|
||||
from framework.graph.conversation import ConversationStore, NodeConversation
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
from framework.llm.stream_events import (
|
||||
FinishEvent,
|
||||
@@ -2703,11 +2704,21 @@ class EventLoopNode(NodeProtocol):
|
||||
real_tool_results.append(tool_entry)
|
||||
logged_tool_calls.append(tool_entry)
|
||||
|
||||
# Strip image content for models that can't handle it
|
||||
image_content = result.image_content
|
||||
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
|
||||
logger.info(
|
||||
"Stripping image_content from tool result — model '%s' "
|
||||
"does not support images in tool results",
|
||||
ctx.llm.model,
|
||||
)
|
||||
image_content = None
|
||||
|
||||
await conversation.add_tool_result(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=result.content,
|
||||
is_error=result.is_error,
|
||||
image_content=result.image_content,
|
||||
image_content=image_content,
|
||||
)
|
||||
if tc.tool_name in ("ask_user", "ask_user_multiple"):
|
||||
# Defer tool_call_completed until after user responds
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
"""Model capability checks for LLM providers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Prefixes of models/providers known to NOT support image content blocks
|
||||
# inside tool result messages. We use a deny-list (rather than an allow-list)
|
||||
# because most OpenAI-compatible providers pass content lists through to the
|
||||
# API unchanged — only a few are known to silently strip or break on images.
|
||||
_IMAGE_TOOL_RESULT_DENY_PREFIXES: tuple[str, ...] = (
|
||||
# DeepSeek: LiteLLM explicitly flattens all content lists to strings,
|
||||
# silently dropping image blocks.
|
||||
"deepseek/",
|
||||
"deepseek-",
|
||||
# Local model providers: most models lack vision support, and those that
|
||||
# do typically handle images in user messages only, not tool results.
|
||||
"ollama/",
|
||||
"ollama_chat/",
|
||||
"lm_studio/",
|
||||
"vllm/",
|
||||
"llamacpp/",
|
||||
# Cerebras: no known vision/multimodal support.
|
||||
"cerebras/",
|
||||
)
|
||||
|
||||
|
||||
def supports_image_tool_results(model: str) -> bool:
|
||||
"""Return whether *model* can receive image content in tool result messages.
|
||||
|
||||
Models on the deny-list are known to either silently strip images or lack
|
||||
vision support entirely. Everything else is assumed to work (OpenAI,
|
||||
Anthropic, Gemini, Mistral, Groq, etc. all handle it correctly via LiteLLM).
|
||||
"""
|
||||
model_lower = model.lower()
|
||||
return not any(model_lower.startswith(prefix) for prefix in _IMAGE_TOOL_RESULT_DENY_PREFIXES)
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Tests for LLM model capability checks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.llm.capabilities import supports_image_tool_results
|
||||
|
||||
|
||||
class TestSupportsImageToolResults:
    """Verify the deny-list correctly identifies models that can't handle images."""

    # Vision-capable stacks: everything off the deny-list is assumed to
    # forward image tool results correctly via LiteLLM.
    _IMAGE_CAPABLE_MODELS = [
        "gpt-4o",
        "gpt-4o-mini",
        "gpt-4-turbo",
        "openai/gpt-4o",
        "anthropic/claude-sonnet-4-20250514",
        "claude-haiku-4-5-20251001",
        "gemini/gemini-1.5-pro",
        "google/gemini-1.5-flash",
        "mistral/mistral-large",
        "groq/llama3-70b",
        "together/meta-llama/Llama-3-70b",
        "fireworks_ai/llama-v3-70b",
        "azure/gpt-4o",
        "kimi/claude-sonnet-4-20250514",
        "hive/claude-sonnet-4-20250514",
    ]

    # Deny-listed providers: known to strip images from tool results or to
    # lack vision support entirely.
    _IMAGE_DENIED_MODELS = [
        "deepseek/deepseek-chat",
        "deepseek/deepseek-coder",
        "deepseek-chat",
        "deepseek-reasoner",
        "ollama/llama3",
        "ollama/mistral",
        "ollama_chat/llama3",
        "lm_studio/my-model",
        "vllm/meta-llama/Llama-3-70b",
        "llamacpp/model",
        "cerebras/llama3-70b",
    ]

    @pytest.mark.parametrize("model", _IMAGE_CAPABLE_MODELS)
    def test_supported_models(self, model: str):
        """Models absent from the deny-list are reported as image-capable."""
        assert supports_image_tool_results(model) is True

    @pytest.mark.parametrize("model", _IMAGE_DENIED_MODELS)
    def test_unsupported_models(self, model: str):
        """Deny-listed providers are reported as unable to take images."""
        assert supports_image_tool_results(model) is False

    def test_case_insensitive(self):
        """Prefix matching must ignore the case of the model identifier."""
        assert supports_image_tool_results("DeepSeek/deepseek-chat") is False
        assert supports_image_tool_results("OLLAMA/llama3") is False
        assert supports_image_tool_results("GPT-4o") is True
|
||||
Reference in New Issue
Block a user