From 73511a3c59de873ad91746e5d21f561b73e0b949 Mon Sep 17 00:00:00 2001 From: Richard Tang Date: Thu, 30 Apr 2026 13:02:57 -0700 Subject: [PATCH] feat: vision fallback with intent --- core/framework/agent_loop/agent_loop.py | 21 +------ .../internals/cursor_persistence.py | 26 ++++++-- .../agent_loop/internals/vision_fallback.py | 59 +++++++++++++++++-- core/framework/llm/litellm.py | 58 ++++++++++++++++++ 4 files changed, 134 insertions(+), 30 deletions(-) diff --git a/core/framework/agent_loop/agent_loop.py b/core/framework/agent_loop/agent_loop.py index 58443df1..c54c06b3 100644 --- a/core/framework/agent_loop/agent_loop.py +++ b/core/framework/agent_loop/agent_loop.py @@ -181,23 +181,6 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str: return cleaned -async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None: - """Describe images for the injection-queue drain (no preceding tool call). - - Wraps :func:`_captioning_chain` with a generic intent and returns - the caption inside an ``[image attached — description: …]`` envelope - so the injected text reads as image content rather than free-form - prose. - """ - intent = "Describe the attached image(s) so a text-only agent can understand them." - result = await _captioning_chain(intent, image_content) - if not result: - return None - description, model = result - label = "image" if len(image_content) == 1 else f"{len(image_content)} images" - return f"[{label} attached — description: {description}]", model - - def _vision_fallback_active(model: str | None) -> bool: """Return True if tool-result images for *model* should be routed through the vision-fallback chain rather than sent to the model. @@ -3435,7 +3418,7 @@ class AgentLoop(AgentProtocol): # single image needs captioning, this collapses to a # single await with no overhead. _model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model) - caption_tasks: dict[str, asyncio.Task[str | None]] = {} + caption_tasks: dict[str, asyncio.Task[tuple[str, str] | None]] = {} if _model_text_only: for tc in tool_calls[:executed_in_batch]: res = results_by_id.get(tc.tool_use_id) @@ -4139,7 +4122,7 @@ class AgentLoop(AgentProtocol): queue=self._injection_queue, conversation=conversation, ctx=ctx, - describe_images_as_text_fn=_describe_images_as_text, + caption_image_fn=_captioning_chain, ) async def _drain_trigger_queue(self, conversation: NodeConversation) -> int: diff --git a/core/framework/agent_loop/internals/cursor_persistence.py b/core/framework/agent_loop/internals/cursor_persistence.py index 8092b7b9..6829fd9d 100644 --- a/core/framework/agent_loop/internals/cursor_persistence.py +++ b/core/framework/agent_loop/internals/cursor_persistence.py @@ -162,9 +162,20 @@ async def drain_injection_queue( conversation: NodeConversation, *, ctx: NodeContext, - describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None, + caption_image_fn: ( + Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None + ) = None, ) -> int: - """Drain all pending injected events as user messages. Returns count.""" + """Drain all pending injected events as user messages. Returns count. + + ``caption_image_fn`` is the unified vision fallback hook. It takes + ``(intent, image_content)`` and returns ``(caption, model)`` on + success — the model id is logged so the destination is observable. 
+ The user's typed ``content`` (the injected message body) is passed + as the intent so the captioner can answer the user's specific + question about the image rather than producing a generic + description; an empty content falls back to a generic intent. + """ count = 0 logger.debug( "[drain_injection_queue] Starting to drain queue, initial queue size: %s", @@ -184,10 +195,13 @@ async def drain_injection_queue( "Model '%s' does not support images; attempting vision fallback", ctx.llm.model, ) - if describe_images_as_text_fn is not None: - described = await describe_images_as_text_fn(image_content) - if described: - description, vision_model = described + if caption_image_fn is not None: + intent = content or ( + "Describe these user-injected images for a text-only agent." + ) + caption_result = await caption_image_fn(intent, image_content) + if caption_result: + description, vision_model = caption_result content = f"{content}\n\n{description}" if content else description logger.info( "[drain] image described as text via vision fallback (model '%s')", diff --git a/core/framework/agent_loop/internals/vision_fallback.py b/core/framework/agent_loop/internals/vision_fallback.py index 31f37e9c..4431eb87 100644 --- a/core/framework/agent_loop/internals/vision_fallback.py +++ b/core/framework/agent_loop/internals/vision_fallback.py @@ -196,16 +196,53 @@ async def caption_tool_image( {"role": "user", "content": user_blocks}, ] + # Apply the same proxy rewrites the main LLM provider uses so a + # `hive/...` / `kimi/...` model resolves to the right Anthropic- + # compatible endpoint with the right auth header. Without this, + # litellm doesn't know what `hive/kimi-k2.5` is and rejects the call + # with "LLM Provider NOT provided." + from framework.llm.litellm import rewrite_proxy_model + + rewritten_model, rewritten_base, extra_headers = rewrite_proxy_model( + model, api_key, api_base + ) + kwargs: dict[str, Any] = { - "model": model, + "model": rewritten_model, "messages": messages, "max_tokens": 1024, "timeout": timeout_s, } - if api_key: + # Pass api_key directly only when there are no proxy-rewritten + # extra_headers carrying the auth (e.g. the gemini-3-flash override + # path goes direct to Gemini, not through the Hive proxy). + if api_key and not extra_headers: kwargs["api_key"] = api_key - if api_base: - kwargs["api_base"] = api_base + if rewritten_base: + kwargs["api_base"] = rewritten_base + if extra_headers: + kwargs["extra_headers"] = extra_headers + + # Surface where the request is going so the user can verify the + # vision fallback is hitting the expected proxy / model. Redacts + # the API key to a length+head+tail digest so it can be cross- + # correlated with other auth-related log lines. 
+ key_digest = ( + f"len={len(api_key)} {api_key[:8]}…{api_key[-4:]}" + if api_key and len(api_key) >= 12 + else f"len={len(api_key) if api_key else 0}" + ) + logger.info( + "[vision_fallback] dispatching: configured_model=%s rewritten_model=%s " + "api_base=%s api_key=%s images=%d intent_chars=%d timeout_s=%.1f", + model, + rewritten_model, + rewritten_base or "", + key_digest, + len(image_content), + len(intent), + timeout_s, + ) started = datetime.now() caption: str | None = None @@ -215,9 +252,21 @@ async def caption_tool_image( text = (response.choices[0].message.content or "").strip() if text: caption = text + logger.info( + "[vision_fallback] response: model=%s api_base=%s elapsed_s=%.2f chars=%d", + rewritten_model, + rewritten_base or "", + (datetime.now() - started).total_seconds(), + len(text), + ) except Exception as exc: error_text = f"{type(exc).__name__}: {exc}" - logger.debug("vision_fallback model '%s' failed: %s", model, exc) + logger.warning( + "[vision_fallback] failed: model=%s api_base=%s error=%s", + rewritten_model, + rewritten_base or "", + error_text, + ) # Best-effort audit log so users can grep ~/.hive/llm_logs/ for # vision-fallback subagent calls. Failures here must not bubble. diff --git a/core/framework/llm/litellm.py b/core/framework/llm/litellm.py index 349f911a..9fd7f5ac 100644 --- a/core/framework/llm/litellm.py +++ b/core/framework/llm/litellm.py @@ -211,6 +211,44 @@ def _ensure_ollama_chat_prefix(model: str) -> str: return model +def rewrite_proxy_model( + model: str, api_key: str | None, api_base: str | None +) -> tuple[str, str | None, dict[str, str]]: + """Apply Hive/Kimi proxy rewrites for any caller of ``litellm.acompletion``. + + Both the Hive LLM proxy and Kimi For Coding expose Anthropic-API- + compatible endpoints. LiteLLM doesn't recognise the ``hive/`` or + ``kimi/`` prefixes natively, so we rewrite them to ``anthropic/`` + here. For the Hive proxy we also stamp a Bearer token into + ``extra_headers`` because litellm's Anthropic handler only sends + ``x-api-key`` and the proxy expects ``Authorization: Bearer``. + + Used by ad-hoc ``litellm.acompletion`` callers (e.g. the vision- + fallback subagent in ``caption_tool_image``) so they hit the same + proxy with the same auth as the main agent's ``LiteLLMProvider``. + The provider's own ``__init__`` keeps its inlined rewrite for now — + this helper is the single source of truth for ad-hoc callers. + + Returns: (rewritten_model, normalised_api_base, extra_headers). + The ``extra_headers`` dict is non-empty only for the Hive proxy + (and only when ``api_key`` is provided). + """ + extra_headers: dict[str, str] = {} + if model.lower().startswith("kimi/"): + model = "anthropic/" + model[len("kimi/") :] + if api_base and api_base.rstrip("/").endswith("/v1"): + api_base = api_base.rstrip("/")[:-3] + elif model.lower().startswith("hive/"): + model = "anthropic/" + model[len("hive/") :] + if api_base and api_base.rstrip("/").endswith("/v1"): + api_base = api_base.rstrip("/")[:-3] + # Hive proxy expects Bearer auth; litellm's Anthropic handler + # only sends x-api-key without this nudge. 
+ if api_key: + extra_headers["Authorization"] = f"Bearer {api_key}" + return model, api_base, extra_headers + + RATE_LIMIT_MAX_RETRIES = 10 RATE_LIMIT_BACKOFF_BASE = 2 # seconds RATE_LIMIT_MAX_DELAY = 120 # seconds - cap to prevent absurd waits @@ -963,6 +1001,7 @@ class LiteLLMProvider(LLMProvider): # Translate kimi/ prefix to anthropic/ so litellm uses the Anthropic # Messages API handler and routes to that endpoint — no special headers needed. _original_model = model + self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/")) if _is_ollama_model(model): model = _ensure_ollama_chat_prefix(model) elif model.lower().startswith("kimi/"): @@ -1016,6 +1055,7 @@ class LiteLLMProvider(LLMProvider): these attributes in-place propagates to all callers on the next LLM call. """ _original_model = model + self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/")) if _is_ollama_model(model): model = _ensure_ollama_chat_prefix(model) elif model.lower().startswith("kimi/"): @@ -1255,6 +1295,16 @@ class LiteLLMProvider(LLMProvider): # Ollama requires explicit tool_choice=auto for function calling # so future readers don't have to guess. kwargs.setdefault("tool_choice", "auto") + elif self._hive_proxy_auth: + # The Hive LLM proxy fronts GLM, which drifts into "explain + # the plan" mode on long-context turns instead of emitting + # tool_use blocks (verified 2026-04-28: tool_choice=null → + # text-only stop=stop; tool_choice=required → clean + # tool_use). Force a tool call when tools are available + # so queens can't get stuck in chat mode. Callers that + # legitimately want a non-tool turn can override via + # extra_kwargs. + kwargs.setdefault("tool_choice", "required") # Add response_format for structured output # LiteLLM passes this through to the underlying provider @@ -1492,6 +1542,10 @@ class LiteLLMProvider(LLMProvider): # Ollama requires explicit tool_choice=auto for function calling # so future readers don't have to guess. kwargs.setdefault("tool_choice", "auto") + elif self._hive_proxy_auth: + # See `complete()` for the rationale: GLM behind the Hive + # proxy needs forcing or it goes chat-mode on long contexts. + kwargs.setdefault("tool_choice", "required") if response_format: kwargs["response_format"] = response_format @@ -2276,6 +2330,10 @@ class LiteLLMProvider(LLMProvider): # Ollama requires explicit tool_choice=auto for function calling # so future readers don't have to guess. kwargs.setdefault("tool_choice", "auto") + elif self._hive_proxy_auth: + # See `complete()` for the rationale: GLM behind the Hive + # proxy needs forcing or it goes chat-mode on long contexts. + kwargs.setdefault("tool_choice", "required") if response_format: kwargs["response_format"] = response_format # The Codex ChatGPT backend (Responses API) rejects several params.
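
For reviewers who want to exercise the new helper outside the agent loop, here is a minimal sketch of the ad-hoc caller pattern that rewrite_proxy_model is meant to serve, mirroring the kwargs assembly in caption_tool_image above. The function name describe_screenshot, the placeholder model id, base URL and key, and the OpenAI-style image_url content block are illustrative assumptions, not part of this patch.

    import litellm

    from framework.llm.litellm import rewrite_proxy_model


    async def describe_screenshot(
        model: str,
        api_key: str | None,
        api_base: str | None,
        image_b64: str,
        intent: str,
        timeout_s: float = 60.0,
    ) -> str | None:
        # Resolve hive/ and kimi/ prefixes to anthropic/, normalise the base
        # URL, and pick up the Bearer header the Hive proxy expects.
        rewritten_model, rewritten_base, extra_headers = rewrite_proxy_model(
            model, api_key, api_base
        )
        kwargs: dict = {
            "model": rewritten_model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": intent},
                        # Assumed OpenAI-style image block; the patch does not
                        # show how caption_tool_image builds its user_blocks.
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                        },
                    ],
                }
            ],
            "max_tokens": 1024,
            "timeout": timeout_s,
        }
        # Same auth rule as caption_tool_image: when the rewrite issued
        # extra_headers, the Bearer header carries the key, so do not also
        # pass api_key.
        if api_key and not extra_headers:
            kwargs["api_key"] = api_key
        if rewritten_base:
            kwargs["api_base"] = rewritten_base
        if extra_headers:
            kwargs["extra_headers"] = extra_headers
        response = await litellm.acompletion(**kwargs)
        return (response.choices[0].message.content or "").strip() or None

Per the helper's own logic, a call like rewrite_proxy_model("hive/kimi-k2.5", key, "https://hive-proxy.example/v1") (hypothetical URL) returns ("anthropic/kimi-k2.5", "https://hive-proxy.example", {"Authorization": f"Bearer {key}"}); a kimi/ model gets the same prefix and /v1 rewrite but no extra header, and any other model passes through unchanged.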
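
And a small sketch of the caption_image_fn contract that drain_injection_queue now relies on: the hook takes (intent, image_content) and returns (caption, model) or None. The stub captioner and the sample injected message below are placeholders standing in for _captioning_chain and real queue content; only the wiring mirrors the patch.

    import asyncio
    from typing import Any


    async def stub_captioner(
        intent: str, image_content: list[dict[str, Any]]
    ) -> tuple[str, str] | None:
        # A real hook (the patch wires _captioning_chain here) would call the
        # vision model; this stub only demonstrates the return shape.
        return (
            f"caption for {len(image_content)} image(s), intent={intent!r}",
            "stub-vision-model",
        )


    async def main() -> None:
        content = "What error does this screenshot show?"
        image_content = [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
        ]
        # The injected message body doubles as the captioning intent; an empty
        # body falls back to the generic prompt, as in drain_injection_queue.
        intent = content or "Describe these user-injected images for a text-only agent."
        caption_result = await stub_captioner(intent, image_content)
        if caption_result:
            description, vision_model = caption_result
            content = f"{content}\n\n{description}" if content else description
            print(f"caption model: {vision_model}")
        print(content)


    asyncio.run(main())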