feat: image vision fallback

This commit is contained in:
Timothy
2026-04-23 21:24:56 -07:00
parent 2621fb88b1
commit ea9c163438
4 changed files with 559 additions and 6 deletions
@@ -85,7 +85,12 @@ from framework.agent_loop.internals.types import (
JudgeVerdict,
TriggerEvent,
)
from framework.agent_loop.internals.vision_fallback import (
caption_tool_image,
extract_intent_for_tool,
)
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
from framework.config import get_vision_fallback_model
from framework.host.event_bus import EventBus
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool, ToolResult, ToolUse
@@ -219,6 +224,52 @@ async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str |
return None
def _vision_fallback_active(model: str | None) -> bool:
"""Return True if tool-result images for *model* should be routed
through the vision-fallback chain rather than sent to the model.
Trigger: the model appears in Hive's curated text-only deny list
(``capabilities.supports_image_tool_results`` returns False).
That list is the only reliable signal: LiteLLM's
``supports_vision`` returns False for any unknown model
(including custom-served vision-capable models like Jackrong/Qwopus3.5),
so it cannot be used as a gate; and LiteLLM's openai chat
transformer doesn't strip image blocks anyway, so passing them
through to a vision-capable but litellm-unrecognised model still
works end-to-end.
The ``vision_fallback`` config block is the *substitution* model;
it doesn't widen the trigger. To force fallback for a model the
deny list doesn't cover yet, add it to
``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` /
``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime
config.
"""
if not model:
return False
return not supports_image_tool_results(model)
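For orientation, here is a minimal sketch of the deny-list gate this function leans on. ``framework.llm.capabilities`` is not part of this diff, so the prefix values and the bare-name/provider split below are assumptions; only the two tuple names come from the docstring above:

# Hypothetical sketch of the gate in framework/llm/capabilities.py
# (module not shown in this diff; the tuple entries are placeholders).
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = ("some-text-only-model",)
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = ("some-text-only-provider/",)

def supports_image_tool_results(model: str) -> bool:
    """Return False iff *model* matches the curated text-only deny list."""
    name = model.lower()
    bare = name.split("/", 1)[-1]  # drop any "provider/" qualifier
    if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
        return False
    return not any(name.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES)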
async def _captioning_chain(
intent: str,
image_content: list[dict[str, Any]],
) -> str | None:
"""Two-stage caption chain used by the agent-loop tool-result hook.
Stage 1: configured ``vision_fallback`` model with intent + images.
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku →
gemini-flash) when stage 1 is unconfigured or fails.
Returns the caption text or None if both stages fail. Caller is
responsible for the placeholder-on-None and the splice into the
persisted tool-result content.
"""
caption = await caption_tool_image(intent, image_content)
if not caption:
caption = await _describe_images_as_text(image_content)
return caption
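Stage 2's ``_describe_images_as_text`` body sits outside this hunk (only its signature appears in the hunk header above). A rotation consistent with the docstring might look like this sketch, where ``_caption_with`` is a hypothetical helper standing in for one captioning round trip:

# Assumed shape of the stage-2 generic-caption rotation (not shown here).
_GENERIC_CAPTION_MODELS = ("gpt-4o-mini", "claude-3-haiku", "gemini-flash")

async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
    for model in _GENERIC_CAPTION_MODELS:
        try:
            caption = await _caption_with(model, image_content)  # hypothetical helper
        except Exception:
            continue  # any failure rotates to the next model
        if caption:
            return caption
    return None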
# Pattern for detecting context-window-exceeded errors across LLM providers.
_CONTEXT_TOO_LARGE_RE = re.compile(
r"context.{0,20}(length|window|limit|size)|"
@@ -625,8 +676,23 @@ class AgentLoop(AgentProtocol):
# Hide image-producing tools from text-only models so they never try
# to call them. Avoids wasted turns + "screenshot failed" lessons
# getting saved to memory. See framework.llm.capabilities.
# EXCEPTION: when the model IS on the text-only deny list AND
# a vision_fallback subagent is configured, leave image tools
# visible. The post-execution hook in the inner tool loop
# will route each image_content through the fallback VLM and
# replace it with a text caption before the main agent sees
# the result — so the main agent gets captions instead of
# raw images, rather than losing the tool entirely. We DON'T
# bypass the filter for vision-capable models (that would be
# a no-op anyway — the filter doesn't fire for them) and we
# DON'T bypass it without a configured fallback (the agent
# would just see raw stripped tool results with no caption).
_llm_model = ctx.llm.model if ctx.llm else ""
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
_text_only_main = _llm_model and not supports_image_tool_results(_llm_model)
if _text_only_main and get_vision_fallback_model() is not None:
_hidden_image_tools: list[str] = []
else:
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
logger.info(
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
@@ -3361,6 +3427,32 @@ class AgentLoop(AgentProtocol):
# Phase 3: record results into conversation in original order,
# build logged/real lists, and publish completed events.
#
# Vision-fallback prefetch: a single turn may fire several
# image-producing tools in parallel (e.g. one screenshot
per tab). Captioning each one takes a vision LLM round
trip (1–30 s). Doing them sequentially in this loop
# would serialise that latency per image. Instead, kick
# off all caption tasks concurrently NOW, and await each
# one just-in-time inside the per-tc body. If only a
# single image needs captioning, this collapses to a
# single await with no overhead.
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
caption_tasks: dict[str, asyncio.Task[str | None]] = {}
if _model_text_only:
for tc in tool_calls[:executed_in_batch]:
res = results_by_id.get(tc.tool_use_id)
if not res or not res.image_content:
continue
intent = extract_intent_for_tool(
conversation,
tc.tool_name,
tc.tool_input or {},
)
caption_tasks[tc.tool_use_id] = asyncio.create_task(
_captioning_chain(intent, res.image_content)
)
for tc in tool_calls[:executed_in_batch]:
result = results_by_id.get(tc.tool_use_id)
if result is None:
@@ -3383,11 +3475,31 @@ class AgentLoop(AgentProtocol):
logged_tool_calls.append(tool_entry)
image_content = result.image_content
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
logger.info(
"Stripping image_content from tool result; model '%s' does not support images in tool results",
ctx.llm.model,
)
# Vision-fallback marker spliced into the persisted text
# below. None when no captioning ran (vision-capable
# main model, no images, or no fallback chain reached
# this tool).
vision_fallback_marker: str | None = None
if image_content and tc.tool_use_id in caption_tasks:
caption = await caption_tasks.pop(tc.tool_use_id)
if caption:
vision_fallback_marker = f"[vision-fallback caption]\n{caption}"
logger.info(
"vision_fallback: captioned %d image(s) for tool '%s' "
"(model '%s' routed through fallback)",
len(image_content),
tc.tool_name,
ctx.llm.model if ctx.llm else "?",
)
else:
vision_fallback_marker = "[image stripped — vision fallback exhausted]"
logger.info(
"vision_fallback: exhausted; stripping %d image(s) from "
"tool '%s' result without caption (model '%s')",
len(image_content),
tc.tool_name,
ctx.llm.model if ctx.llm else "?",
)
image_content = None
# Apply replay-detector steer prefix if this call matched a
@@ -3399,6 +3511,11 @@ class AgentLoop(AgentProtocol):
if _prefix:
stored_content = f"{_prefix}{stored_content or ''}"
# Splice the vision-fallback caption / placeholder into
# the persisted text after any prefix has been applied.
if vision_fallback_marker:
stored_content = f"{stored_content or ''}\n\n{vision_fallback_marker}"
await conversation.add_tool_result(
tool_use_id=tc.tool_use_id,
content=stored_content,
@@ -0,0 +1,220 @@
"""Vision-fallback subagent for tool-result images on text-only LLMs.
When a tool returns image content but the main agent's model can't
accept image blocks (per ``supports_image_tool_results``), the framework
strips the images before they ever reach the LLM. Without this module,
the agent then sees only the tool's text envelope (URL, dimensions,
size) and is blind to whatever the image actually shows.
This module provides:
* ``caption_tool_image()``: a direct LiteLLM call to a configured
vision model (``vision_fallback`` block in ``~/.hive/configuration.json``)
that takes the agent's intent + the image(s) and returns a textual
description tailored to that intent.
* ``extract_intent_for_tool()``: pull the most recent assistant text
+ the tool call descriptor and concatenate them into a 4KB intent
string the vision subagent can reason against.
Both helpers degrade silently (returning ``None`` / a placeholder rather
than raising) so a vision-fallback failure can never kill the main
agent's run. The agent-loop call site is responsible for chaining
through to the existing generic-caption rotation
(``_describe_images_as_text``) on a None return.
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Any
from framework.config import (
get_vision_fallback_api_base,
get_vision_fallback_api_key,
get_vision_fallback_model,
)
if TYPE_CHECKING:
from ..conversation import NodeConversation
logger = logging.getLogger(__name__)
# Hard cap on the intent string handed to the vision subagent. The
# subagent only needs the agent's recent reasoning + the tool descriptor;
# anything longer is wasted tokens (and risks pushing past the vision
# model's context with the image attached).
_INTENT_MAX_CHARS = 4096
# Cap on the tool args JSON snippet inside the intent. Some tool inputs
# (large strings, file contents) would dominate the intent if uncapped.
_TOOL_ARGS_MAX_CHARS = 4096
# Subagent system prompt — kept short so it fits within any provider's
# system-prompt budget alongside the user message + image. Tells the
# subagent its role and constrains output format.
_VISION_SUBAGENT_SYSTEM = (
"You are a vision subagent for a text-only main agent. The main "
"agent invoked a tool that returned the image(s) attached. Their "
"intent (their reasoning + the tool call) is below. Describe what "
"the image shows in service of their intent — concrete, factual, "
"no speculation. If their intent asks a yes/no question, answer it "
"directly first. Output plain text, no markdown, ≤ 600 words."
)
def extract_intent_for_tool(
conversation: NodeConversation,
tool_name: str,
tool_args: dict[str, Any] | None,
) -> str:
"""Build the intent string passed to the vision subagent.
Combines the most recent assistant text (the LLM's reasoning right
before invoking the tool) with a structured tool-call descriptor.
Truncates to ``_INTENT_MAX_CHARS`` total, favouring the head of the
assistant text where goal-stating sentences usually live.
If no preceding assistant text exists (rare first turn), falls
back to ``"<no preceding reasoning>"`` so the subagent still gets
the tool descriptor.
"""
args_json: str
try:
args_json = json.dumps(tool_args or {}, default=str)
except Exception:
args_json = repr(tool_args)
if len(args_json) > _TOOL_ARGS_MAX_CHARS:
args_json = args_json[:_TOOL_ARGS_MAX_CHARS] + "…"
tool_line = f"Called: {tool_name}({args_json})"
# Walk newest → oldest, take the first assistant message with text.
assistant_text = ""
try:
messages = getattr(conversation, "_messages", []) or []
for msg in reversed(messages):
if getattr(msg, "role", None) != "assistant":
continue
content = getattr(msg, "content", "") or ""
if isinstance(content, str) and content.strip():
assistant_text = content.strip()
break
except Exception:
# Defensive — the agent loop must keep running even if the
# conversation structure changes shape.
assistant_text = ""
if not assistant_text:
assistant_text = "<no preceding reasoning>"
# Intent = tool descriptor (always intact) + reasoning (truncated).
head = f"{tool_line}\n\nReasoning before call:\n"
budget = _INTENT_MAX_CHARS - len(head)
if budget < 100:
# Tool descriptor is huge somehow — truncate it.
return head[:_INTENT_MAX_CHARS]
if len(assistant_text) > budget:
assistant_text = assistant_text[: budget - 1] + "…"
return head + assistant_text
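For concreteness, a hypothetical ``browser_screenshot`` call (a tool name this commit's quickstart text mentions) preceded by one assistant sentence would yield an intent string shaped like:

Called: browser_screenshot({"tab_id": 3})

Reasoning before call:
I need to verify the checkout button rendered after the JS patch.

Both the arguments and the reasoning line are invented for illustration; the shape (intact tool descriptor first, truncated reasoning after) is what the function above guarantees.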
async def caption_tool_image(
intent: str,
image_content: list[dict[str, Any]],
*,
timeout_s: float = 30.0,
) -> str | None:
"""Caption the given images using the configured ``vision_fallback`` model.
Returns the model's text response on success, or ``None`` on any
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
cost / latency / quality are auditable post-hoc, tagged with
``execution_id="vision_fallback_subagent"``.
"""
model = get_vision_fallback_model()
if not model:
return None
api_key = get_vision_fallback_api_key()
api_base = get_vision_fallback_api_base()
if not api_key:
logger.debug("vision_fallback configured but no API key resolved; skipping")
return None
try:
import litellm
except ImportError:
return None
user_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
user_blocks.extend(image_content)
messages = [
{"role": "system", "content": _VISION_SUBAGENT_SYSTEM},
{"role": "user", "content": user_blocks},
]
kwargs: dict[str, Any] = {
"model": model,
"messages": messages,
"max_tokens": 1024,
"timeout": timeout_s,
"api_key": api_key,
}
if api_base:
kwargs["api_base"] = api_base
started = datetime.now()
caption: str | None = None
error_text: str | None = None
try:
response = await litellm.acompletion(**kwargs)
text = (response.choices[0].message.content or "").strip()
if text:
caption = text
except Exception as exc:
error_text = f"{type(exc).__name__}: {exc}"
logger.debug("vision_fallback model '%s' failed: %s", model, exc)
# Best-effort audit log so users can grep ~/.hive/llm_logs/ for
# vision-fallback subagent calls. Failures here must not bubble.
try:
from framework.tracker.llm_debug_logger import log_llm_turn
# Don't dump the base64 image data into the log file — that
# would balloon the jsonl with mostly-binary noise.
elided_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
elided_blocks.extend(
{"type": "image_url", "image_url": {"url": "<elided>"}}
for _ in range(len(image_content))
)
log_llm_turn(
node_id="vision_fallback_subagent",
stream_id="vision_fallback",
execution_id="vision_fallback_subagent",
iteration=0,
system_prompt=_VISION_SUBAGENT_SYSTEM,
messages=[{"role": "user", "content": elided_blocks}],
assistant_text=caption or "",
tool_calls=[],
tool_results=[],
token_counts={
"model": model,
"elapsed_s": (datetime.now() - started).total_seconds(),
"error": error_text,
"num_images": len(image_content),
"intent_chars": len(intent),
},
)
except Exception:
pass
return caption
__all__ = ["caption_tool_image", "extract_intent_for_tool"]
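A hedged usage sketch of the module's public surface. The intent text and the base64 payload are placeholders, and the image block uses the OpenAI-style ``image_url`` shape the audit-log eliding code above assumes:

import asyncio

from framework.agent_loop.internals.vision_fallback import caption_tool_image

async def demo() -> None:
    # One tool-result image block, payload elided.
    blocks = [
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,<elided>"}}
    ]
    intent = 'Called: browser_screenshot({"tab_id": 3})\n\nReasoning before call:\n<elided>'
    caption = await caption_tool_image(intent, blocks, timeout_s=30.0)
    # None means unconfigured / no key / timeout / empty response.
    print(caption or "<fell through to the generic-caption rotation>")

asyncio.run(demo())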
@@ -155,6 +155,57 @@ def get_preferred_worker_model() -> str | None:
return None
def get_vision_fallback_model() -> str | None:
"""Return the configured vision-fallback model, or None if not configured.
Reads from the ``vision_fallback`` section of ~/.hive/configuration.json.
Used by the agent-loop hook that captions tool-result images when the
main agent's model cannot accept image content (text-only LLMs).
When this returns None the fallback chain skips the configured-subagent
stage and proceeds straight to the generic caption rotation
(``_describe_images_as_text``).
"""
vision = get_hive_config().get("vision_fallback", {})
if vision.get("provider") and vision.get("model"):
provider = str(vision["provider"])
model = str(vision["model"]).strip()
if provider.lower() == "openrouter" and model.lower().startswith("openrouter/"):
model = model[len("openrouter/") :]
if model:
return f"{provider}/{model}"
return None
def get_vision_fallback_api_key() -> str | None:
"""Return the API key for the vision-fallback model.
Resolution order: ``vision_fallback.api_key_env_var`` from the env,
then the default ``get_api_key()``. No subscription-token branches
vision fallback is intended for hosted vision models (Anthropic,
OpenAI, Google), not for the subscription-bearer providers.
"""
vision = get_hive_config().get("vision_fallback", {})
if not vision:
return get_api_key()
api_key_env_var = vision.get("api_key_env_var")
if api_key_env_var:
return os.environ.get(api_key_env_var)
return get_api_key()
def get_vision_fallback_api_base() -> str | None:
"""Return the api_base for the vision-fallback model, or None."""
vision = get_hive_config().get("vision_fallback", {})
if not vision:
return None
if vision.get("api_base"):
return vision["api_base"]
if str(vision.get("provider", "")).lower() == "openrouter":
return OPENROUTER_API_BASE
return None
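Tying the three getters together: a hypothetical ``vision_fallback`` block in ``~/.hive/configuration.json`` (the model id is a placeholder, not a recommendation; the api_base shown is OpenRouter's public endpoint):

{
  "vision_fallback": {
    "provider": "openrouter",
    "model": "openrouter/some-vendor/some-vision-model",
    "api_key_env_var": "OPENROUTER_API_KEY",
    "api_base": "https://openrouter.ai/api/v1"
  }
}

With this block, ``get_vision_fallback_model()`` strips the redundant ``openrouter/`` prefix and returns ``openrouter/some-vendor/some-vision-model``; the key is read from ``$OPENROUTER_API_KEY``; and had ``api_base`` been omitted, the openrouter provider would still fall back to ``OPENROUTER_API_BASE``.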
def get_worker_api_key() -> str | None:
"""Return the API key for the worker LLM, falling back to the default key."""
worker_llm = get_hive_config().get("worker_llm", {})
@@ -1042,6 +1042,49 @@ print(json.dumps(config, indent=2))
PY
}
save_vision_fallback() {
# Write the `vision_fallback` block to ~/.hive/configuration.json.
# Args: provider_id, model, env_var (api_key_env_var), api_base (optional)
# When provider_id is empty, REMOVE the block entirely (user opted out).
local provider_id="$1"
local model="$2"
local env_var="$3"
local api_base="${4:-}"
uv run python - "$provider_id" "$model" "$env_var" "$api_base" <<'PY'
import json
import sys
from pathlib import Path
provider_id, model, env_var, api_base = sys.argv[1:5]
cfg_path = Path.home() / ".hive" / "configuration.json"
cfg_path.parent.mkdir(parents=True, exist_ok=True)
try:
with open(cfg_path, encoding="utf-8-sig") as f:
config = json.load(f)
except (OSError, json.JSONDecodeError):
config = {}
# Empty provider_id means the user opted out — drop the block.
if not provider_id:
config.pop("vision_fallback", None)
else:
block = {"provider": provider_id, "model": model}
if env_var:
block["api_key_env_var"] = env_var
if api_base:
block["api_base"] = api_base
config["vision_fallback"] = block
tmp_path = cfg_path.with_name(cfg_path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
tmp_path.replace(cfg_path)
PY
}
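For reference, the two shapes in which the prompt below ends up calling this helper (the concrete provider/model pair is illustrative, though both names appear elsewhere in this commit):

# Write a block: provider_id, model, api_key_env_var, optional api_base.
save_vision_fallback "openai" "gpt-4o-mini" "OPENAI_API_KEY" ""
# Opt out: an empty provider_id removes any existing vision_fallback block.
save_vision_fallback "" "" "" ""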
# Source shell rc file to pick up existing env vars (temporarily disable set -e)
set +e
if [ -f "$SHELL_RC_FILE" ]; then
@@ -1772,6 +1815,128 @@ fi
echo ""
# ============================================================
# Vision Fallback (subagent for tool-result images)
# ============================================================
#
# When a tool returns an image (browser_screenshot, render_image, etc.)
# but the main agent's model is text-only, the framework can route the
# image through a separate VLM subagent that returns a text caption,
# preserving the agent's ability to reason about visual state.
#
# We always offer the prompt — even for vision-capable main models —
# so the user gets a working fallback if they ever swap to a text-only
# model. The block is dormant for vision-capable mains (the gating
# in agent_loop only fires for models on Hive's deny list).
if [ -n "$SELECTED_PROVIDER_ID" ]; then
echo -e "${YELLOW}${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}"
echo ""
echo -e " ${DIM}When a screenshot/image tool is called from a text-only model,${NC}"
echo -e " ${DIM}the framework can route the image through a vision-capable VLM${NC}"
echo -e " ${DIM}and inject the caption into the conversation. Inert when your${NC}"
echo -e " ${DIM}main model already supports vision (most do).${NC}"
echo ""
# Build the candidate list from the same model_catalog.json the main
# LLM step uses — never hardcode model IDs in this script. For each
# provider in the catalogue, take the catalogue's default model and
# the env var name it expects, then keep only providers the user
# already has an API key for. Output one TSV row per candidate:
# provider_id<TAB>model<TAB>env_var<TAB>display_name
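# Hypothetical example row (real rows depend on model_catalog.json and
# on which keys are exported at setup time):
# anthropic<TAB>default-model<TAB>ANTHROPIC_API_KEY<TAB>anthropic/default-model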
VISION_CANDIDATES_TSV=$(uv run python - <<'PY'
import os
from framework.llm.model_catalog import get_default_models, get_models_catalogue
# Map provider_id → the env-var name the framework reads its key from.
# Mirrors PROVIDER_ENV_VARS at the top of quickstart.sh, plus how the
# rest of the script picks an env var per provider.
PROVIDER_KEY_ENV = {
"anthropic": "ANTHROPIC_API_KEY",
"openai": "OPENAI_API_KEY",
"gemini": "GEMINI_API_KEY",
"groq": "GROQ_API_KEY",
"cerebras": "CEREBRAS_API_KEY",
"minimax": "MINIMAX_API_KEY",
"mistral": "MISTRAL_API_KEY",
"together": "TOGETHER_API_KEY",
"deepseek": "DEEPSEEK_API_KEY",
"kimi": "KIMI_API_KEY",
"openrouter": "OPENROUTER_API_KEY",
}
defaults = get_default_models()
catalog = get_models_catalogue()
for provider_id, default_model in sorted(defaults.items()):
env = PROVIDER_KEY_ENV.get(provider_id)
if not env:
continue
# Either GEMINI_API_KEY or GOOGLE_API_KEY unlocks gemini.
has_key = bool(os.environ.get(env))
if provider_id == "gemini" and not has_key:
if os.environ.get("GOOGLE_API_KEY"):
has_key = True
env = "GOOGLE_API_KEY"
if not has_key:
continue
# Display name: provider/model from the catalogue verbatim
display = f"{provider_id}/{default_model}"
print(f"{provider_id}\t{default_model}\t{env}\t{display}")
PY
)
if [ -z "$VISION_CANDIDATES_TSV" ]; then
echo -e " ${YELLOW}No matching API keys detected for any catalog provider.${NC}"
echo -e " ${DIM}Set an API key for any provider in model_catalog.json and rerun.${NC}"
echo -e " ${DIM}Skipping for now — text-only models will lose image content silently.${NC}"
else
# Materialise into bash array for selection
VISION_CANDIDATES=()
while IFS= read -r line; do
[ -n "$line" ] && VISION_CANDIDATES+=("$line")
done <<< "$VISION_CANDIDATES_TSV"
echo -e " ${BOLD}Available vision-fallback models${NC} ${DIM}(from model_catalog.json):${NC}"
echo -e " ${DIM}0)${NC} (skip — don't configure vision fallback)"
idx=1
for entry in "${VISION_CANDIDATES[@]}"; do
IFS=$'\t' read -r _vp _vm _vk _vd <<< "$entry"
echo -e " ${DIM}${idx})${NC} ${_vd} ${DIM}[\$${_vk}]${NC}"
idx=$((idx + 1))
done
echo ""
VISION_CHOICE=""
while true; do
read -r -p " Pick a vision-fallback model [1-${#VISION_CANDIDATES[@]}, 0=skip, default=1]: " VISION_CHOICE || VISION_CHOICE=""
VISION_CHOICE="${VISION_CHOICE:-1}"
if [[ "$VISION_CHOICE" =~ ^[0-9]+$ ]] && \
[ "$VISION_CHOICE" -ge 0 ] && \
[ "$VISION_CHOICE" -le "${#VISION_CANDIDATES[@]}" ]; then
break
fi
echo -e " ${YELLOW}Please enter 0 (skip) or 1-${#VISION_CANDIDATES[@]}.${NC}"
done
if [ "$VISION_CHOICE" = "0" ]; then
# Explicit skip — drop any prior block so config stays clean.
save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true
echo -e " ${DIM}skipped — no vision_fallback block written${NC}"
else
chosen="${VISION_CANDIDATES[$((VISION_CHOICE - 1))]}"
IFS=$'\t' read -r vf_provider vf_model vf_env vf_display <<< "$chosen"
echo -n " Saving vision_fallback... "
if save_vision_fallback "$vf_provider" "$vf_model" "$vf_env" "" > /dev/null; then
echo -e "${GREEN}${NC}"
echo -e " ${DIM}vision_fallback: ${vf_display} (key from \$${vf_env})${NC}"
else
echo -e "${RED}failed${NC}"
echo -e " ${YELLOW}Could not write vision_fallback to ~/.hive/configuration.json — non-fatal, edit manually if needed.${NC}"
fi
fi
fi
echo ""
fi
# ============================================================
# Browser Automation (GCU) — always enabled
# ============================================================