chore: log the vision fallback model usage

This commit is contained in:
Richard Tang
2026-04-29 13:04:48 -07:00
parent 5492366c31
commit 4794c8b816
3 changed files with 37 additions and 21 deletions
+23 -14
View File
@@ -182,8 +182,13 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
return cleaned
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
"""Describe images using the best available vision model."""
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images using the best available vision model.
Returns ``(description, model)`` on success — the formatted
placeholder text plus the model id that produced it — or ``None``
when every candidate fails or no API key is configured.
"""
import litellm
blocks: list[dict[str, Any]] = [
@@ -216,7 +221,7 @@ async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str |
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]"
return f"[{label} attached — description: {description}]", model
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
@@ -247,21 +252,22 @@ def _vision_fallback_active(model: str | None) -> bool:
async def _captioning_chain(
intent: str,
image_content: list[dict[str, Any]],
) -> str | None:
) -> tuple[str, str] | None:
"""Two-stage caption chain used by the agent-loop tool-result hook.
Stage 1: configured ``vision_fallback`` model with intent + images.
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku →
gemini-flash) when stage 1 is unconfigured or fails.
Returns the caption text or None if both stages fail. Caller is
responsible for the placeholder-on-None and the splice into the
persisted tool-result content.
Returns ``(caption, model)`` — the caption text paired with the
model id that produced it — or ``None`` if both stages fail.
Caller is responsible for the placeholder-on-None and the splice
into the persisted tool-result content.
"""
caption = await caption_tool_image(intent, image_content)
if not caption:
caption = await _describe_images_as_text(image_content)
return caption
result = await caption_tool_image(intent, image_content)
if not result:
result = await _describe_images_as_text(image_content)
return result
# Pattern for detecting context-window-exceeded errors across LLM providers.
@@ -3499,14 +3505,17 @@ class AgentLoop(AgentProtocol):
# this tool).
vision_fallback_marker: str | None = None
if image_content and tc.tool_use_id in caption_tasks:
caption = await caption_tasks.pop(tc.tool_use_id)
if caption:
caption_result = await caption_tasks.pop(tc.tool_use_id)
if caption_result:
caption, vision_model = caption_result
vision_fallback_marker = f"[vision-fallback caption]\n{caption}"
logger.info(
"vision_fallback: captioned %d image(s) for tool '%s' (model '%s' routed through fallback)",
"vision_fallback: captioned %d image(s) for tool '%s' "
"(main model '%s' routed through fallback model '%s')",
len(image_content),
tc.tool_name,
ctx.llm.model if ctx.llm else "?",
vision_model,
)
else:
vision_fallback_marker = "[image stripped — vision fallback exhausted]"
@@ -162,7 +162,7 @@ async def drain_injection_queue(
conversation: NodeConversation,
*,
ctx: NodeContext,
describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[str | None]] | None) = None,
describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None,
) -> int:
"""Drain all pending injected events as user messages. Returns count."""
count = 0
@@ -185,10 +185,14 @@ async def drain_injection_queue(
ctx.llm.model,
)
if describe_images_as_text_fn is not None:
description = await describe_images_as_text_fn(image_content)
if description:
described = await describe_images_as_text_fn(image_content)
if described:
description, vision_model = described
content = f"{content}\n\n{description}" if content else description
logger.info("[drain] image described as text via vision fallback")
logger.info(
"[drain] image described as text via vision fallback (model '%s')",
vision_model,
)
else:
logger.info("[drain] no vision fallback available; images dropped")
image_content = None
@@ -156,10 +156,11 @@ async def caption_tool_image(
image_content: list[dict[str, Any]],
*,
timeout_s: float = 30.0,
) -> str | None:
) -> tuple[str, str] | None:
"""Caption the given images using the configured ``vision_fallback`` model.
Returns the model's text response on success, or ``None`` on any
Returns ``(caption, model)`` on success — the model's text response
paired with the model id that produced it — or ``None`` on any
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
@@ -241,7 +242,9 @@ async def caption_tool_image(
except Exception:
pass
return caption
if caption is None:
return None
return caption, model
__all__ = ["caption_tool_image", "extract_intent_for_tool"]