feat: vision fallback with intent
This commit is contained in:
@@ -181,23 +181,6 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
|
||||
return cleaned
|
||||
|
||||
|
||||
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
|
||||
"""Describe images for the injection-queue drain (no preceding tool call).
|
||||
|
||||
Wraps :func:`_captioning_chain` with a generic intent and returns
|
||||
the caption inside an ``[image attached — description: …]`` envelope
|
||||
so the injected text reads as image content rather than free-form
|
||||
prose.
|
||||
"""
|
||||
intent = "Describe the attached image(s) so a text-only agent can understand them."
|
||||
result = await _captioning_chain(intent, image_content)
|
||||
if not result:
|
||||
return None
|
||||
description, model = result
|
||||
label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
|
||||
return f"[{label} attached — description: {description}]", model
|
||||
|
||||
|
||||
def _vision_fallback_active(model: str | None) -> bool:
|
||||
"""Return True if tool-result images for *model* should be routed
|
||||
through the vision-fallback chain rather than sent to the model.
|
||||
@@ -3435,7 +3418,7 @@ class AgentLoop(AgentProtocol):
|
||||
# single image needs captioning, this collapses to a
|
||||
# single await with no overhead.
|
||||
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
|
||||
caption_tasks: dict[str, asyncio.Task[str | None]] = {}
|
||||
caption_tasks: dict[str, asyncio.Task[tuple[str, str] | None]] = {}
|
||||
if _model_text_only:
|
||||
for tc in tool_calls[:executed_in_batch]:
|
||||
res = results_by_id.get(tc.tool_use_id)
|
||||
@@ -4139,7 +4122,7 @@ class AgentLoop(AgentProtocol):
|
||||
queue=self._injection_queue,
|
||||
conversation=conversation,
|
||||
ctx=ctx,
|
||||
describe_images_as_text_fn=_describe_images_as_text,
|
||||
caption_image_fn=_captioning_chain,
|
||||
)
|
||||
|
||||
async def _drain_trigger_queue(self, conversation: NodeConversation) -> int:
|
||||
|
||||
@@ -162,9 +162,20 @@ async def drain_injection_queue(
|
||||
conversation: NodeConversation,
|
||||
*,
|
||||
ctx: NodeContext,
|
||||
describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None,
|
||||
caption_image_fn: (
|
||||
Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None
|
||||
) = None,
|
||||
) -> int:
|
||||
"""Drain all pending injected events as user messages. Returns count."""
|
||||
"""Drain all pending injected events as user messages. Returns count.
|
||||
|
||||
``caption_image_fn`` is the unified vision fallback hook. It takes
|
||||
``(intent, image_content)`` and returns ``(caption, model)`` on
|
||||
success — the model id is logged so the destination is observable.
|
||||
The user's typed ``content`` (the injected message body) is passed
|
||||
as the intent so the captioner can answer the user's specific
|
||||
question about the image rather than producing a generic
|
||||
description; an empty content falls back to a generic intent.
|
||||
"""
|
||||
count = 0
|
||||
logger.debug(
|
||||
"[drain_injection_queue] Starting to drain queue, initial queue size: %s",
|
||||
@@ -184,10 +195,13 @@ async def drain_injection_queue(
|
||||
"Model '%s' does not support images; attempting vision fallback",
|
||||
ctx.llm.model,
|
||||
)
|
||||
if describe_images_as_text_fn is not None:
|
||||
described = await describe_images_as_text_fn(image_content)
|
||||
if described:
|
||||
description, vision_model = described
|
||||
if caption_image_fn is not None:
|
||||
intent = content or (
|
||||
"Describe these user-injected images for a text-only agent."
|
||||
)
|
||||
caption_result = await caption_image_fn(intent, image_content)
|
||||
if caption_result:
|
||||
description, vision_model = caption_result
|
||||
content = f"{content}\n\n{description}" if content else description
|
||||
logger.info(
|
||||
"[drain] image described as text via vision fallback (model '%s')",
|
||||
|
||||
@@ -196,16 +196,53 @@ async def caption_tool_image(
|
||||
{"role": "user", "content": user_blocks},
|
||||
]
|
||||
|
||||
# Apply the same proxy rewrites the main LLM provider uses so a
|
||||
# `hive/...` / `kimi/...` model resolves to the right Anthropic-
|
||||
# compatible endpoint with the right auth header. Without this,
|
||||
# litellm doesn't know what `hive/kimi-k2.5` is and rejects the call
|
||||
# with "LLM Provider NOT provided."
|
||||
from framework.llm.litellm import rewrite_proxy_model
|
||||
|
||||
rewritten_model, rewritten_base, extra_headers = rewrite_proxy_model(
|
||||
model, api_key, api_base
|
||||
)
|
||||
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": model,
|
||||
"model": rewritten_model,
|
||||
"messages": messages,
|
||||
"max_tokens": 1024,
|
||||
"timeout": timeout_s,
|
||||
}
|
||||
if api_key:
|
||||
# Pass api_key directly only when there are no proxy-rewritten
|
||||
# extra_headers carrying the auth (e.g. the gemini-3-flash override
|
||||
# path goes direct to Gemini, not through the Hive proxy).
|
||||
if api_key and not extra_headers:
|
||||
kwargs["api_key"] = api_key
|
||||
if api_base:
|
||||
kwargs["api_base"] = api_base
|
||||
if rewritten_base:
|
||||
kwargs["api_base"] = rewritten_base
|
||||
if extra_headers:
|
||||
kwargs["extra_headers"] = extra_headers
|
||||
|
||||
# Surface where the request is going so the user can verify the
|
||||
# vision fallback is hitting the expected proxy / model. Redacts
|
||||
# the API key to a length+head+tail digest so it can be cross-
|
||||
# correlated with other auth-related log lines.
|
||||
key_digest = (
|
||||
f"len={len(api_key)} {api_key[:8]}…{api_key[-4:]}"
|
||||
if api_key and len(api_key) >= 12
|
||||
else f"len={len(api_key) if api_key else 0}"
|
||||
)
|
||||
logger.info(
|
||||
"[vision_fallback] dispatching: configured_model=%s rewritten_model=%s "
|
||||
"api_base=%s api_key=%s images=%d intent_chars=%d timeout_s=%.1f",
|
||||
model,
|
||||
rewritten_model,
|
||||
rewritten_base or "<litellm-default>",
|
||||
key_digest,
|
||||
len(image_content),
|
||||
len(intent),
|
||||
timeout_s,
|
||||
)
|
||||
|
||||
started = datetime.now()
|
||||
caption: str | None = None
|
||||
@@ -215,9 +252,21 @@ async def caption_tool_image(
|
||||
text = (response.choices[0].message.content or "").strip()
|
||||
if text:
|
||||
caption = text
|
||||
logger.info(
|
||||
"[vision_fallback] response: model=%s api_base=%s elapsed_s=%.2f chars=%d",
|
||||
rewritten_model,
|
||||
rewritten_base or "<litellm-default>",
|
||||
(datetime.now() - started).total_seconds(),
|
||||
len(text),
|
||||
)
|
||||
except Exception as exc:
|
||||
error_text = f"{type(exc).__name__}: {exc}"
|
||||
logger.debug("vision_fallback model '%s' failed: %s", model, exc)
|
||||
logger.warning(
|
||||
"[vision_fallback] failed: model=%s api_base=%s error=%s",
|
||||
rewritten_model,
|
||||
rewritten_base or "<litellm-default>",
|
||||
error_text,
|
||||
)
|
||||
|
||||
# Best-effort audit log so users can grep ~/.hive/llm_logs/ for
|
||||
# vision-fallback subagent calls. Failures here must not bubble.
|
||||
|
||||
@@ -211,6 +211,44 @@ def _ensure_ollama_chat_prefix(model: str) -> str:
|
||||
return model
|
||||
|
||||
|
||||
def rewrite_proxy_model(
|
||||
model: str, api_key: str | None, api_base: str | None
|
||||
) -> tuple[str, str | None, dict[str, str]]:
|
||||
"""Apply Hive/Kimi proxy rewrites for any caller of ``litellm.acompletion``.
|
||||
|
||||
Both the Hive LLM proxy and Kimi For Coding expose Anthropic-API-
|
||||
compatible endpoints. LiteLLM doesn't recognise the ``hive/`` or
|
||||
``kimi/`` prefixes natively, so we rewrite them to ``anthropic/``
|
||||
here. For the Hive proxy we also stamp a Bearer token into
|
||||
``extra_headers`` because litellm's Anthropic handler only sends
|
||||
``x-api-key`` and the proxy expects ``Authorization: Bearer``.
|
||||
|
||||
Used by ad-hoc ``litellm.acompletion`` callers (e.g. the vision-
|
||||
fallback subagent in ``caption_tool_image``) so they hit the same
|
||||
proxy with the same auth as the main agent's ``LiteLLMProvider``.
|
||||
The provider's own ``__init__`` keeps its inlined rewrite for now —
|
||||
this helper is the single source of truth for ad-hoc callers.
|
||||
|
||||
Returns: (rewritten_model, normalised_api_base, extra_headers).
|
||||
The ``extra_headers`` dict is non-empty only for the Hive proxy
|
||||
(and only when ``api_key`` is provided).
|
||||
"""
|
||||
extra_headers: dict[str, str] = {}
|
||||
if model.lower().startswith("kimi/"):
|
||||
model = "anthropic/" + model[len("kimi/") :]
|
||||
if api_base and api_base.rstrip("/").endswith("/v1"):
|
||||
api_base = api_base.rstrip("/")[:-3]
|
||||
elif model.lower().startswith("hive/"):
|
||||
model = "anthropic/" + model[len("hive/") :]
|
||||
if api_base and api_base.rstrip("/").endswith("/v1"):
|
||||
api_base = api_base.rstrip("/")[:-3]
|
||||
# Hive proxy expects Bearer auth; litellm's Anthropic handler
|
||||
# only sends x-api-key without this nudge.
|
||||
if api_key:
|
||||
extra_headers["Authorization"] = f"Bearer {api_key}"
|
||||
return model, api_base, extra_headers
|
||||
|
||||
|
||||
RATE_LIMIT_MAX_RETRIES = 10
|
||||
RATE_LIMIT_BACKOFF_BASE = 2 # seconds
|
||||
RATE_LIMIT_MAX_DELAY = 120 # seconds - cap to prevent absurd waits
|
||||
@@ -963,6 +1001,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Translate kimi/ prefix to anthropic/ so litellm uses the Anthropic
|
||||
# Messages API handler and routes to that endpoint — no special headers needed.
|
||||
_original_model = model
|
||||
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
|
||||
if _is_ollama_model(model):
|
||||
model = _ensure_ollama_chat_prefix(model)
|
||||
elif model.lower().startswith("kimi/"):
|
||||
@@ -1016,6 +1055,7 @@ class LiteLLMProvider(LLMProvider):
|
||||
these attributes in-place propagates to all callers on the next LLM call.
|
||||
"""
|
||||
_original_model = model
|
||||
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
|
||||
if _is_ollama_model(model):
|
||||
model = _ensure_ollama_chat_prefix(model)
|
||||
elif model.lower().startswith("kimi/"):
|
||||
@@ -1255,6 +1295,16 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Ollama requires explicit tool_choice=auto for function calling
|
||||
# so future readers don't have to guess.
|
||||
kwargs.setdefault("tool_choice", "auto")
|
||||
elif self._hive_proxy_auth:
|
||||
# The Hive LLM proxy fronts GLM, which drifts into "explain
|
||||
# the plan" mode on long-context turns instead of emitting
|
||||
# tool_use blocks (verified 2026-04-28: tool_choice=null →
|
||||
# text-only stop=stop; tool_choice=required → clean
|
||||
# tool_use). Force a tool call when tools are available
|
||||
# so queens can't get stuck in chat mode. Callers that
|
||||
# legitimately want a non-tool turn can override via
|
||||
# extra_kwargs.
|
||||
kwargs.setdefault("tool_choice", "required")
|
||||
|
||||
# Add response_format for structured output
|
||||
# LiteLLM passes this through to the underlying provider
|
||||
@@ -1492,6 +1542,10 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Ollama requires explicit tool_choice=auto for function calling
|
||||
# so future readers don't have to guess.
|
||||
kwargs.setdefault("tool_choice", "auto")
|
||||
elif self._hive_proxy_auth:
|
||||
# See `complete()` for the rationale: GLM behind the Hive
|
||||
# proxy needs forcing or it goes chat-mode on long contexts.
|
||||
kwargs.setdefault("tool_choice", "required")
|
||||
if response_format:
|
||||
kwargs["response_format"] = response_format
|
||||
|
||||
@@ -2276,6 +2330,10 @@ class LiteLLMProvider(LLMProvider):
|
||||
# Ollama requires explicit tool_choice=auto for function calling
|
||||
# so future readers don't have to guess.
|
||||
kwargs.setdefault("tool_choice", "auto")
|
||||
elif self._hive_proxy_auth:
|
||||
# See `complete()` for the rationale: GLM behind the Hive
|
||||
# proxy needs forcing or it goes chat-mode on long contexts.
|
||||
kwargs.setdefault("tool_choice", "required")
|
||||
if response_format:
|
||||
kwargs["response_format"] = response_format
|
||||
# The Codex ChatGPT backend (Responses API) rejects several params.
|
||||
|
||||
Reference in New Issue
Block a user