feat: vision model retry and fallback
This commit is contained in:
@@ -14,7 +14,6 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
|
||||
|
||||
|
||||
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
|
||||
"""Describe images using the best available vision model.
|
||||
"""Describe images for the injection-queue drain (no preceding tool call).
|
||||
|
||||
Returns ``(description, model)`` on success — the formatted
|
||||
placeholder text plus the model id that produced it — or ``None``
|
||||
when every candidate fails or no API key is configured.
|
||||
Wraps :func:`_captioning_chain` with a generic intent and returns
|
||||
the caption inside an ``[image attached — description: …]`` envelope
|
||||
so the injected text reads as image content rather than free-form
|
||||
prose.
|
||||
"""
|
||||
import litellm
|
||||
|
||||
blocks: list[dict[str, Any]] = [
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
"Describe the following image(s) concisely but with enough detail "
|
||||
"that a text-only AI assistant can understand the content and context."
|
||||
),
|
||||
}
|
||||
]
|
||||
blocks.extend(image_content)
|
||||
|
||||
candidates: list[str] = []
|
||||
if os.environ.get("OPENAI_API_KEY"):
|
||||
candidates.append("gpt-4o-mini")
|
||||
if os.environ.get("ANTHROPIC_API_KEY"):
|
||||
candidates.append("claude-3-haiku-20240307")
|
||||
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
|
||||
candidates.append("gemini/gemini-1.5-flash")
|
||||
|
||||
for model in candidates:
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": blocks}],
|
||||
max_tokens=512,
|
||||
)
|
||||
description = (response.choices[0].message.content or "").strip()
|
||||
if description:
|
||||
count = len(image_content)
|
||||
label = "image" if count == 1 else f"{count} images"
|
||||
return f"[{label} attached — description: {description}]", model
|
||||
except Exception as exc:
|
||||
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
|
||||
continue
|
||||
|
||||
return None
|
||||
intent = "Describe the attached image(s) so a text-only agent can understand them."
|
||||
result = await _captioning_chain(intent, image_content)
|
||||
if not result:
|
||||
return None
|
||||
description, model = result
|
||||
label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
|
||||
return f"[{label} attached — description: {description}]", model
|
||||
|
||||
|
||||
def _vision_fallback_active(model: str | None) -> bool:
|
||||
@@ -253,21 +222,22 @@ async def _captioning_chain(
|
||||
intent: str,
|
||||
image_content: list[dict[str, Any]],
|
||||
) -> tuple[str, str] | None:
|
||||
"""Two-stage caption chain used by the agent-loop tool-result hook.
|
||||
"""Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
|
||||
|
||||
Stage 1: configured ``vision_fallback`` model with intent + images.
|
||||
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku
|
||||
→ gemini-flash) when stage 1 is unconfigured or fails.
|
||||
|
||||
Returns ``(caption, model)`` — the caption text paired with the
|
||||
model id that produced it — or ``None`` if both stages fail.
|
||||
Caller is responsible for the placeholder-on-None and the splice
|
||||
into the persisted tool-result content.
|
||||
The Gemini override reuses the configured ``api_key`` / ``api_base``,
|
||||
so a Hive subscriber (whose token routes to a multi-model proxy)
|
||||
keeps coverage when their primary model glitches. Without
|
||||
configured creds litellm falls through to env-based Gemini auth;
|
||||
users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
|
||||
third try.
|
||||
"""
|
||||
result = await caption_tool_image(intent, image_content)
|
||||
if not result:
|
||||
result = await _describe_images_as_text(image_content)
|
||||
return result
|
||||
if result := await caption_tool_image(intent, image_content):
|
||||
return result
|
||||
logger.warning("vision_fallback failed; retrying configured model")
|
||||
if result := await caption_tool_image(intent, image_content):
|
||||
return result
|
||||
logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
|
||||
return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
|
||||
|
||||
|
||||
# Pattern for detecting context-window-exceeded errors across LLM providers.
|
||||
|
||||
@@ -18,9 +18,10 @@ This module provides:
|
||||
|
||||
Both helpers degrade silently — return ``None`` / a placeholder rather
|
||||
than raise — so a vision-fallback failure can never kill the main
|
||||
agent's run. The agent-loop call site is responsible for chaining
|
||||
through to the existing generic-caption rotation
|
||||
(``_describe_images_as_text``) on a None return.
|
||||
agent's run. The agent-loop call site retries the configured model
|
||||
once on a None return, then falls back to
|
||||
``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
|
||||
of :func:`caption_tool_image`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -156,25 +157,30 @@ async def caption_tool_image(
|
||||
image_content: list[dict[str, Any]],
|
||||
*,
|
||||
timeout_s: float = 30.0,
|
||||
model_override: str | None = None,
|
||||
) -> tuple[str, str] | None:
|
||||
"""Caption the given images using the configured ``vision_fallback`` model.
|
||||
|
||||
Returns ``(caption, model)`` on success — the model's text response
|
||||
paired with the model id that produced it — or ``None`` on any
|
||||
failure (no config, no API key, timeout, exception, empty
|
||||
response). Callers chain to the next stage of the fallback on None.
|
||||
Returns ``(caption, model)`` on success or ``None`` on any failure
|
||||
(no config, no API key, timeout, exception, empty response).
|
||||
|
||||
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
|
||||
cost / latency / quality are auditable post-hoc, tagged with
|
||||
``execution_id="vision_fallback_subagent"``.
|
||||
``model_override`` swaps in a different litellm model id while
|
||||
keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
|
||||
untouched. That's deliberate: Hive subscribers configure
|
||||
``vision_fallback`` to point at the Hive proxy, which routes to
|
||||
multiple models including Gemini — so reusing the credentials lets
|
||||
a Gemini-3-flash override still work without a separate
|
||||
``GEMINI_API_KEY``. When no creds are configured, litellm falls
|
||||
back to env-var resolution.
|
||||
|
||||
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
|
||||
"""
|
||||
model = get_vision_fallback_model()
|
||||
model = model_override or get_vision_fallback_model()
|
||||
if not model:
|
||||
return None
|
||||
|
||||
api_key = get_vision_fallback_api_key()
|
||||
api_base = get_vision_fallback_api_base()
|
||||
if not api_key:
|
||||
if not api_key and not model_override:
|
||||
logger.debug("vision_fallback configured but no API key resolved; skipping")
|
||||
return None
|
||||
|
||||
@@ -195,8 +201,9 @@ async def caption_tool_image(
|
||||
"messages": messages,
|
||||
"max_tokens": 1024,
|
||||
"timeout": timeout_s,
|
||||
"api_key": api_key,
|
||||
}
|
||||
if api_key:
|
||||
kwargs["api_key"] = api_key
|
||||
if api_base:
|
||||
kwargs["api_base"] = api_base
|
||||
|
||||
|
||||
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
|
||||
Used by the agent-loop hook that captions tool-result images when the
|
||||
main agent's model cannot accept image content (text-only LLMs).
|
||||
|
||||
When this returns None the fallback chain skips the configured-subagent
|
||||
stage and proceeds straight to the generic caption rotation
|
||||
(``_describe_images_as_text``).
|
||||
When this returns None the captioning chain's configured + retry
|
||||
attempts both no-op (returning None), and only the final
|
||||
``gemini/gemini-3-flash-preview`` override has a chance to succeed
|
||||
— and only if a ``GEMINI_API_KEY`` is set in the environment.
|
||||
"""
|
||||
vision = get_hive_config().get("vision_fallback", {})
|
||||
if vision.get("provider") and vision.get("model"):
|
||||
|
||||
Reference in New Issue
Block a user