feat: vision model retry and fallback
@@ -14,7 +14,6 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
-import os
 import re
 import time
 import uuid
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
 
 
 async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
-    """Describe images using the best available vision model.
-
-    Returns ``(description, model)`` on success — the formatted
-    placeholder text plus the model id that produced it — or ``None``
-    when every candidate fails or no API key is configured.
-    """
-    import litellm
-
-    blocks: list[dict[str, Any]] = [
-        {
-            "type": "text",
-            "text": (
-                "Describe the following image(s) concisely but with enough detail "
-                "that a text-only AI assistant can understand the content and context."
-            ),
-        }
-    ]
-    blocks.extend(image_content)
-
-    candidates: list[str] = []
-    if os.environ.get("OPENAI_API_KEY"):
-        candidates.append("gpt-4o-mini")
-    if os.environ.get("ANTHROPIC_API_KEY"):
-        candidates.append("claude-3-haiku-20240307")
-    if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
-        candidates.append("gemini/gemini-1.5-flash")
-
-    for model in candidates:
-        try:
-            response = await litellm.acompletion(
-                model=model,
-                messages=[{"role": "user", "content": blocks}],
-                max_tokens=512,
-            )
-            description = (response.choices[0].message.content or "").strip()
-            if description:
-                count = len(image_content)
-                label = "image" if count == 1 else f"{count} images"
-                return f"[{label} attached — description: {description}]", model
-        except Exception as exc:
-            logger.debug("Vision fallback model '%s' failed: %s", model, exc)
-            continue
-
-    return None
+    """Describe images for the injection-queue drain (no preceding tool call).
+
+    Wraps :func:`_captioning_chain` with a generic intent and returns
+    the caption inside an ``[image attached — description: …]`` envelope
+    so the injected text reads as image content rather than free-form
+    prose.
+    """
+    intent = "Describe the attached image(s) so a text-only agent can understand them."
+    result = await _captioning_chain(intent, image_content)
+    if not result:
+        return None
+    description, model = result
+    label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
+    return f"[{label} attached — description: {description}]", model
 
 
 def _vision_fallback_active(model: str | None) -> bool:
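Note: after this hunk the helper is a thin wrapper over _captioning_chain. A minimal sketch of the call shape, assuming litellm-style image blocks and the module's own namespace (the payload and the demo wrapper are illustrative, not part of the diff):

from typing import Any

async def demo_injection_drain() -> str | None:
    # Hypothetical one-image payload in the OpenAI/litellm content-block format.
    image_content: list[dict[str, Any]] = [
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KG..."}},
    ]
    result = await _describe_images_as_text(image_content)
    if result is None:
        return None  # every stage failed; the caller injects nothing
    placeholder, model = result
    # placeholder reads "[image attached — description: ...]"; model records
    # which LLM produced it, e.g. for the llm_logs audit trail.
    return placeholder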
@@ -253,21 +222,22 @@ async def _captioning_chain(
     intent: str,
     image_content: list[dict[str, Any]],
 ) -> tuple[str, str] | None:
-    """Two-stage caption chain used by the agent-loop tool-result hook.
-
-    Stage 1: configured ``vision_fallback`` model with intent + images.
-    Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku
-    → gemini-flash) when stage 1 is unconfigured or fails.
-
-    Returns ``(caption, model)`` — the caption text paired with the
-    model id that produced it — or ``None`` if both stages fail.
-    Caller is responsible for the placeholder-on-None and the splice
-    into the persisted tool-result content.
-    """
-    result = await caption_tool_image(intent, image_content)
-    if not result:
-        result = await _describe_images_as_text(image_content)
-    return result
+    """Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
+
+    The Gemini override reuses the configured ``api_key`` / ``api_base``,
+    so a Hive subscriber (whose token routes to a multi-model proxy)
+    keeps coverage when their primary model glitches. Without
+    configured creds litellm falls through to env-based Gemini auth;
+    users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
+    third try.
+    """
+    if result := await caption_tool_image(intent, image_content):
+        return result
+    logger.warning("vision_fallback failed; retrying configured model")
+    if result := await caption_tool_image(intent, image_content):
+        return result
+    logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
+    return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
 
 
 # Pattern for detecting context-window-exceeded errors across LLM providers.
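Note: the new body hard-codes three attempts against the same helper. For review context, the same control flow as a generic sketch; first_success and the stage labels are illustrative names, not code from this commit:

import logging
from collections.abc import Awaitable, Callable
from typing import TypeVar

logger = logging.getLogger(__name__)
T = TypeVar("T")

async def first_success(
    attempts: list[tuple[str, Callable[[], Awaitable[T | None]]]],
) -> T | None:
    # Run each stage in order; return the first non-None result.
    for label, attempt in attempts:
        if (result := await attempt()) is not None:
            return result
        logger.warning("%s produced no caption", label)
    return None

# _captioning_chain is then roughly:
#   await first_success([
#       ("vision_fallback", lambda: caption_tool_image(intent, image_content)),
#       ("vision_fallback retry", lambda: caption_tool_image(intent, image_content)),
#       ("gemini override", lambda: caption_tool_image(
#           intent, image_content, model_override="gemini/gemini-3-flash-preview")),
#   ])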
@@ -18,9 +18,10 @@ This module provides:
 
 Both helpers degrade silently — return ``None`` / a placeholder rather
 than raise — so a vision-fallback failure can never kill the main
-agent's run. The agent-loop call site is responsible for chaining
-through to the existing generic-caption rotation
-(``_describe_images_as_text``) on a None return.
+agent's run. The agent-loop call site retries the configured model
+once on a None return, then falls back to
+``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
+of :func:`caption_tool_image`.
 """
 
 from __future__ import annotations
@@ -156,25 +157,30 @@ async def caption_tool_image(
     image_content: list[dict[str, Any]],
     *,
     timeout_s: float = 30.0,
+    model_override: str | None = None,
 ) -> tuple[str, str] | None:
     """Caption the given images using the configured ``vision_fallback`` model.
 
-    Returns ``(caption, model)`` on success — the model's text response
-    paired with the model id that produced it — or ``None`` on any
-    failure (no config, no API key, timeout, exception, empty
-    response). Callers chain to the next stage of the fallback on None.
+    Returns ``(caption, model)`` on success or ``None`` on any failure
+    (no config, no API key, timeout, exception, empty response).
 
-    Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
-    cost / latency / quality are auditable post-hoc, tagged with
-    ``execution_id="vision_fallback_subagent"``.
+    ``model_override`` swaps in a different litellm model id while
+    keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
+    untouched. That's deliberate: Hive subscribers configure
+    ``vision_fallback`` to point at the Hive proxy, which routes to
+    multiple models including Gemini — so reusing the credentials lets
+    a Gemini-3-flash override still work without a separate
+    ``GEMINI_API_KEY``. When no creds are configured, litellm falls
+    back to env-var resolution.
+
+    Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
     """
-    model = get_vision_fallback_model()
+    model = model_override or get_vision_fallback_model()
     if not model:
         return None
 
     api_key = get_vision_fallback_api_key()
     api_base = get_vision_fallback_api_base()
-    if not api_key:
+    if not api_key and not model_override:
         logger.debug("vision_fallback configured but no API key resolved; skipping")
         return None
 
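Note: a hedged usage sketch of the new model_override parameter, mirroring the last stage of _captioning_chain (the wrapper function is illustrative and assumes the module's namespace):

from typing import Any

async def last_resort_caption(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
    # Overrides only the model id; the configured api_key/api_base still apply.
    # With no configured key, litellm may resolve GEMINI_API_KEY from the env.
    return await caption_tool_image(
        "Describe the attached image(s) so a text-only agent can understand them.",
        image_content,
        model_override="gemini/gemini-3-flash-preview",
    )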
@@ -195,8 +201,9 @@ async def caption_tool_image(
         "messages": messages,
         "max_tokens": 1024,
         "timeout": timeout_s,
-        "api_key": api_key,
     }
+    if api_key:
+        kwargs["api_key"] = api_key
     if api_base:
         kwargs["api_base"] = api_base
 
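Note: the reason for this hunk is that api_key can now legitimately be None on the override path, and omitting the key from kwargs (rather than passing None) leaves litellm free to use env-var auth such as GEMINI_API_KEY. A sketch of the request construction under that assumption; the send wrapper is illustrative, but the kwargs mirror the function body:

from typing import Any

import litellm  # already a dependency of the module under diff

async def send(
    model: str,
    messages: list[dict[str, Any]],
    api_key: str | None,
    api_base: str | None,
    timeout_s: float = 30.0,
) -> Any:
    kwargs: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": 1024,
        "timeout": timeout_s,
    }
    if api_key:
        kwargs["api_key"] = api_key    # pin to configured creds (e.g. the Hive proxy)
    if api_base:
        kwargs["api_base"] = api_base
    # No "api_key" entry at all -> litellm falls back to env-var resolution.
    return await litellm.acompletion(**kwargs)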
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
     Used by the agent-loop hook that captions tool-result images when the
     main agent's model cannot accept image content (text-only LLMs).
 
-    When this returns None the fallback chain skips the configured-subagent
-    stage and proceeds straight to the generic caption rotation
-    (``_describe_images_as_text``).
+    When this returns None the captioning chain's configured + retry
+    attempts both no-op (returning None), and only the final
+    ``gemini/gemini-3-flash-preview`` override has a chance to succeed
+    — and only if a ``GEMINI_API_KEY`` is set in the environment.
     """
     vision = get_hive_config().get("vision_fallback", {})
     if vision.get("provider") and vision.get("model"):
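Note: for reference, the config shape this getter reads. Only the vision_fallback table with provider and model keys is confirmed by the hunk; the remaining keys and the provider/model join are assumptions inferred from the api_key / api_base getters used above:

# Hypothetical Hive config excerpt (shape assumed, values illustrative).
hive_config = {
    "vision_fallback": {
        "provider": "openai",                    # litellm provider prefix
        "model": "gpt-4o-mini",                  # model id under that provider
        "api_key": "sk-...",                     # read by get_vision_fallback_api_key()
        "api_base": "https://proxy.example/v1",  # e.g. the Hive proxy endpoint
    }
}

vision = hive_config.get("vision_fallback", {})
if vision.get("provider") and vision.get("model"):
    model_id = f"{vision['provider']}/{vision['model']}"  # assumed join format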