diff --git a/core/framework/agent_loop/agent_loop.py b/core/framework/agent_loop/agent_loop.py
index 07e2742d..58443df1 100644
--- a/core/framework/agent_loop/agent_loop.py
+++ b/core/framework/agent_loop/agent_loop.py
@@ -14,7 +14,6 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
-import os
 import re
 import time
 import uuid
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
 
 
 async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
-    """Describe images using the best available vision model.
+    """Describe images for the injection-queue drain (no preceding tool call).
 
-    Returns ``(description, model)`` on success — the formatted
-    placeholder text plus the model id that produced it — or ``None``
-    when every candidate fails or no API key is configured.
+    Wraps :func:`_captioning_chain` with a generic intent and returns
+    the caption inside an ``[image attached — description: …]`` envelope
+    so the injected text reads as image content rather than free-form
+    prose.
     """
-    import litellm
-
-    blocks: list[dict[str, Any]] = [
-        {
-            "type": "text",
-            "text": (
-                "Describe the following image(s) concisely but with enough detail "
-                "that a text-only AI assistant can understand the content and context."
-            ),
-        }
-    ]
-    blocks.extend(image_content)
-
-    candidates: list[str] = []
-    if os.environ.get("OPENAI_API_KEY"):
-        candidates.append("gpt-4o-mini")
-    if os.environ.get("ANTHROPIC_API_KEY"):
-        candidates.append("claude-3-haiku-20240307")
-    if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
-        candidates.append("gemini/gemini-1.5-flash")
-
-    for model in candidates:
-        try:
-            response = await litellm.acompletion(
-                model=model,
-                messages=[{"role": "user", "content": blocks}],
-                max_tokens=512,
-            )
-            description = (response.choices[0].message.content or "").strip()
-            if description:
-                count = len(image_content)
-                label = "image" if count == 1 else f"{count} images"
-                return f"[{label} attached — description: {description}]", model
-        except Exception as exc:
-            logger.debug("Vision fallback model '%s' failed: %s", model, exc)
-            continue
-
-    return None
+    intent = "Describe the attached image(s) so a text-only agent can understand them."
+    result = await _captioning_chain(intent, image_content)
+    if not result:
+        return None
+    description, model = result
+    label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
+    return f"[{label} attached — description: {description}]", model
 
 
 def _vision_fallback_active(model: str | None) -> bool:
@@ -253,21 +222,22 @@ async def _captioning_chain(
     intent: str,
     image_content: list[dict[str, Any]],
 ) -> tuple[str, str] | None:
-    """Two-stage caption chain used by the agent-loop tool-result hook.
+    """Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
 
-    Stage 1: configured ``vision_fallback`` model with intent + images.
-    Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku
-    → gemini-flash) when stage 1 is unconfigured or fails.
-
-    Returns ``(caption, model)`` — the caption text paired with the
-    model id that produced it — or ``None`` if both stages fail.
-    Caller is responsible for the placeholder-on-None and the splice
-    into the persisted tool-result content.
+    The Gemini override reuses the configured ``api_key`` / ``api_base``,
+    so a Hive subscriber (whose token routes to a multi-model proxy)
+    keeps coverage when their primary model glitches. Without
+    configured creds litellm falls through to env-based Gemini auth;
+    users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
+    third try.
     """
-    result = await caption_tool_image(intent, image_content)
-    if not result:
-        result = await _describe_images_as_text(image_content)
-    return result
+    if result := await caption_tool_image(intent, image_content):
+        return result
+    logger.warning("vision_fallback failed; retrying configured model")
+    if result := await caption_tool_image(intent, image_content):
+        return result
+    logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
+    return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
 
 
 # Pattern for detecting context-window-exceeded errors across LLM providers.
diff --git a/core/framework/agent_loop/internals/vision_fallback.py b/core/framework/agent_loop/internals/vision_fallback.py
index 3162fd85..31f37e9c 100644
--- a/core/framework/agent_loop/internals/vision_fallback.py
+++ b/core/framework/agent_loop/internals/vision_fallback.py
@@ -18,9 +18,10 @@ This module provides:
 
 Both helpers degrade silently — return ``None`` / a placeholder rather
 than raise — so a vision-fallback failure can never kill the main
-agent's run. The agent-loop call site is responsible for chaining
-through to the existing generic-caption rotation
-(``_describe_images_as_text``) on a None return.
+agent's run. The agent-loop call site retries the configured model
+once on a None return, then falls back to
+``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
+of :func:`caption_tool_image`.
 """
 
 from __future__ import annotations
@@ -156,25 +157,30 @@ async def caption_tool_image(
     image_content: list[dict[str, Any]],
     *,
     timeout_s: float = 30.0,
+    model_override: str | None = None,
 ) -> tuple[str, str] | None:
     """Caption the given images using the configured ``vision_fallback`` model.
 
-    Returns ``(caption, model)`` on success — the model's text response
-    paired with the model id that produced it — or ``None`` on any
-    failure (no config, no API key, timeout, exception, empty
-    response). Callers chain to the next stage of the fallback on None.
+    Returns ``(caption, model)`` on success or ``None`` on any failure
+    (no config, no API key, timeout, exception, empty response).
 
-    Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
-    cost / latency / quality are auditable post-hoc, tagged with
-    ``execution_id="vision_fallback_subagent"``.
+    ``model_override`` swaps in a different litellm model id while
+    keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
+    untouched. That's deliberate: Hive subscribers configure
+    ``vision_fallback`` to point at the Hive proxy, which routes to
+    multiple models including Gemini — so reusing the credentials lets
+    a Gemini-3-flash override still work without a separate
+    ``GEMINI_API_KEY``. When no creds are configured, litellm falls
+    back to env-var resolution.
+
+    Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
     """
-    model = get_vision_fallback_model()
+    model = model_override or get_vision_fallback_model()
     if not model:
         return None
-
     api_key = get_vision_fallback_api_key()
     api_base = get_vision_fallback_api_base()
-    if not api_key:
+    if not api_key and not model_override:
         logger.debug("vision_fallback configured but no API key resolved; skipping")
         return None
 
@@ -195,8 +201,9 @@ async def caption_tool_image(
         "messages": messages,
         "max_tokens": 1024,
         "timeout": timeout_s,
-        "api_key": api_key,
     }
+    if api_key:
+        kwargs["api_key"] = api_key
     if api_base:
         kwargs["api_base"] = api_base
 
diff --git a/core/framework/config.py b/core/framework/config.py
index 4720b12f..c42209ff 100644
--- a/core/framework/config.py
+++ b/core/framework/config.py
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
     Used by the agent-loop hook that captions tool-result images when the
     main agent's model cannot accept image content (text-only LLMs).
 
-    When this returns None the fallback chain skips the configured-subagent
-    stage and proceeds straight to the generic caption rotation
-    (``_describe_images_as_text``).
+    When this returns None the captioning chain's configured + retry
+    attempts both no-op (returning None), and only the final
+    ``gemini/gemini-3-flash-preview`` override has a chance to succeed
+    — and only if a ``GEMINI_API_KEY`` is set in the environment.
     """
     vision = get_hive_config().get("vision_fallback", {})
     if vision.get("provider") and vision.get("model"):