feat: image vision fallback

This commit is contained in:
Timothy
2026-04-23 21:24:56 -07:00
parent 2621fb88b1
commit ea9c163438
4 changed files with 559 additions and 6 deletions
@@ -85,7 +85,12 @@ from framework.agent_loop.internals.types import (
JudgeVerdict,
TriggerEvent,
)
from framework.agent_loop.internals.vision_fallback import (
caption_tool_image,
extract_intent_for_tool,
)
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
from framework.config import get_vision_fallback_model
from framework.host.event_bus import EventBus
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
from framework.llm.provider import Tool, ToolResult, ToolUse
@@ -219,6 +224,52 @@ async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str |
return None
def _vision_fallback_active(model: str | None) -> bool:
"""Return True if tool-result images for *model* should be routed
through the vision-fallback chain rather than sent to the model.
Trigger: the model appears in Hive's curated text-only deny list
(``capabilities.supports_image_tool_results`` returns False).
That list is the only reliable signal: LiteLLM's
``supports_vision`` returns False for any unknown model
(including custom-served vision-capable models like Jackrong/Qwopus3.5),
so it cannot be used as a gate; and LiteLLM's openai chat
transformer doesn't strip image blocks anyway, so passing them
through to a vision-capable but litellm-unrecognised model still
works end-to-end.
The ``vision_fallback`` config block is the *substitution* model;
it doesn't widen the trigger. To force fallback for a model the
deny list doesn't cover yet, add it to
``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` /
``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime
config.
"""
if not model:
return False
return not supports_image_tool_results(model)
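For orientation, here is a minimal sketch of the deny-list gate this function leans on. ``framework.llm.capabilities`` is not part of this diff, so the prefix values and the bare-name/provider split below are assumptions; only the two tuple names come from the docstring above:

# Hypothetical sketch of the gate in framework/llm/capabilities.py
# (module not shown in this diff; the tuple entries are placeholders).
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = ("some-text-only-model",)
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = ("some-text-only-provider/",)

def supports_image_tool_results(model: str) -> bool:
    """Return False iff *model* matches the curated text-only deny list."""
    name = model.lower()
    bare = name.split("/", 1)[-1]  # drop any "provider/" qualifier
    if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
        return False
    return not any(name.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES)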
async def _captioning_chain(
intent: str,
image_content: list[dict[str, Any]],
) -> str | None:
"""Two-stage caption chain used by the agent-loop tool-result hook.
Stage 1: configured ``vision_fallback`` model with intent + images.
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku →
gemini-flash) when stage 1 is unconfigured or fails.
Returns the caption text or None if both stages fail. Caller is
responsible for the placeholder-on-None and the splice into the
persisted tool-result content.
"""
caption = await caption_tool_image(intent, image_content)
if not caption:
caption = await _describe_images_as_text(image_content)
return caption
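Stage 2's ``_describe_images_as_text`` body sits outside this hunk (only its signature appears in the hunk header above). A rotation consistent with the docstring might look like this sketch, where ``_caption_with`` is a hypothetical helper standing in for one captioning round trip:

# Assumed shape of the stage-2 generic-caption rotation (not shown here).
_GENERIC_CAPTION_MODELS = ("gpt-4o-mini", "claude-3-haiku", "gemini-flash")

async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
    for model in _GENERIC_CAPTION_MODELS:
        try:
            caption = await _caption_with(model, image_content)  # hypothetical helper
        except Exception:
            continue  # any failure rotates to the next model
        if caption:
            return caption
    return None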
# Pattern for detecting context-window-exceeded errors across LLM providers.
_CONTEXT_TOO_LARGE_RE = re.compile(
r"context.{0,20}(length|window|limit|size)|"
@@ -625,8 +676,23 @@ class AgentLoop(AgentProtocol):
# Hide image-producing tools from text-only models so they never try
# to call them. Avoids wasted turns + "screenshot failed" lessons
# getting saved to memory. See framework.llm.capabilities.
# EXCEPTION: when the model IS on the text-only deny list AND
# a vision_fallback subagent is configured, leave image tools
# visible. The post-execution hook in the inner tool loop
# will route each image_content through the fallback VLM and
# replace it with a text caption before the main agent sees
# the result — so the main agent gets captions instead of
# raw images, rather than losing the tool entirely. We DON'T
# bypass the filter for vision-capable models (that would be
# a no-op anyway — the filter doesn't fire for them) and we
# DON'T bypass it without a configured fallback (the agent
# would just see raw stripped tool results with no caption).
_llm_model = ctx.llm.model if ctx.llm else ""
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
_text_only_main = _llm_model and not supports_image_tool_results(_llm_model)
if _text_only_main and get_vision_fallback_model() is not None:
_hidden_image_tools: list[str] = []
else:
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
logger.info(
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
@@ -3361,6 +3427,32 @@ class AgentLoop(AgentProtocol):
# Phase 3: record results into conversation in original order,
# build logged/real lists, and publish completed events.
#
# Vision-fallback prefetch: a single turn may fire several
# image-producing tools in parallel (e.g. one screenshot
per tab). Captioning each one takes a vision LLM round
trip (1–30 s). Doing them sequentially in this loop
# would serialise that latency per image. Instead, kick
# off all caption tasks concurrently NOW, and await each
# one just-in-time inside the per-tc body. If only a
# single image needs captioning, this collapses to a
# single await with no overhead.
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
caption_tasks: dict[str, asyncio.Task[str | None]] = {}
if _model_text_only:
for tc in tool_calls[:executed_in_batch]:
res = results_by_id.get(tc.tool_use_id)
if not res or not res.image_content:
continue
intent = extract_intent_for_tool(
conversation,
tc.tool_name,
tc.tool_input or {},
)
caption_tasks[tc.tool_use_id] = asyncio.create_task(
_captioning_chain(intent, res.image_content)
)
for tc in tool_calls[:executed_in_batch]:
result = results_by_id.get(tc.tool_use_id)
if result is None:
@@ -3383,11 +3475,31 @@ class AgentLoop(AgentProtocol):
logged_tool_calls.append(tool_entry)
image_content = result.image_content
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
logger.info(
"Stripping image_content from tool result; model '%s' does not support images in tool results",
ctx.llm.model,
)
# Vision-fallback marker spliced into the persisted text
# below. None when no captioning ran (vision-capable
# main model, no images, or no fallback chain reached
# this tool).
vision_fallback_marker: str | None = None
if image_content and tc.tool_use_id in caption_tasks:
caption = await caption_tasks.pop(tc.tool_use_id)
if caption:
vision_fallback_marker = f"[vision-fallback caption]\n{caption}"
logger.info(
"vision_fallback: captioned %d image(s) for tool '%s' "
"(model '%s' routed through fallback)",
len(image_content),
tc.tool_name,
ctx.llm.model if ctx.llm else "?",
)
else:
vision_fallback_marker = "[image stripped — vision fallback exhausted]"
logger.info(
"vision_fallback: exhausted; stripping %d image(s) from "
"tool '%s' result without caption (model '%s')",
len(image_content),
tc.tool_name,
ctx.llm.model if ctx.llm else "?",
)
image_content = None
# Apply replay-detector steer prefix if this call matched a
@@ -3399,6 +3511,11 @@ class AgentLoop(AgentProtocol):
if _prefix:
stored_content = f"{_prefix}{stored_content or ''}"
# Splice the vision-fallback caption / placeholder into
# the persisted text after any prefix has been applied.
if vision_fallback_marker:
stored_content = f"{stored_content or ''}\n\n{vision_fallback_marker}"
await conversation.add_tool_result(
tool_use_id=tc.tool_use_id,
content=stored_content,
@@ -0,0 +1,220 @@
"""Vision-fallback subagent for tool-result images on text-only LLMs.
When a tool returns image content but the main agent's model can't
accept image blocks (per ``supports_image_tool_results``), the framework
strips the images before they ever reach the LLM. Without this module,
the agent then sees only the tool's text envelope (URL, dimensions,
size) and is blind to whatever the image actually shows.
This module provides:
* ``caption_tool_image()``: a direct LiteLLM call to a configured
vision model (``vision_fallback`` block in ``~/.hive/configuration.json``)
that takes the agent's intent + the image(s) and returns a textual
description tailored to that intent.
* ``extract_intent_for_tool()``: pull the most recent assistant text
+ the tool call descriptor and concatenate them into a 4KB intent
string the vision subagent can reason against.
Both helpers degrade silently (returning ``None`` / a placeholder rather
than raising) so a vision-fallback failure can never kill the main
agent's run. The agent-loop call site is responsible for chaining
through to the existing generic-caption rotation
(``_describe_images_as_text``) on a None return.
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Any
from framework.config import (
get_vision_fallback_api_base,
get_vision_fallback_api_key,
get_vision_fallback_model,
)
if TYPE_CHECKING:
from ..conversation import NodeConversation
logger = logging.getLogger(__name__)
# Hard cap on the intent string handed to the vision subagent. The
# subagent only needs the agent's recent reasoning + the tool descriptor;
# anything longer is wasted tokens (and risks pushing past the vision
# model's context with the image attached).
_INTENT_MAX_CHARS = 4096
# Cap on the tool args JSON snippet inside the intent. Some tool inputs
# (large strings, file contents) would dominate the intent if uncapped.
_TOOL_ARGS_MAX_CHARS = 4096
# Subagent system prompt — kept short so it fits within any provider's
# system-prompt budget alongside the user message + image. Tells the
# subagent its role and constrains output format.
_VISION_SUBAGENT_SYSTEM = (
"You are a vision subagent for a text-only main agent. The main "
"agent invoked a tool that returned the image(s) attached. Their "
"intent (their reasoning + the tool call) is below. Describe what "
"the image shows in service of their intent — concrete, factual, "
"no speculation. If their intent asks a yes/no question, answer it "
"directly first. Output plain text, no markdown, ≤ 600 words."
)
def extract_intent_for_tool(
conversation: NodeConversation,
tool_name: str,
tool_args: dict[str, Any] | None,
) -> str:
"""Build the intent string passed to the vision subagent.
Combines the most recent assistant text (the LLM's reasoning right
before invoking the tool) with a structured tool-call descriptor.
Truncates to ``_INTENT_MAX_CHARS`` total, favouring the head of the
assistant text where goal-stating sentences usually live.
If no preceding assistant text exists (rare first turn), falls
back to ``"<no preceding reasoning>"`` so the subagent still gets
the tool descriptor.
"""
args_json: str
try:
args_json = json.dumps(tool_args or {}, default=str)
except Exception:
args_json = repr(tool_args)
if len(args_json) > _TOOL_ARGS_MAX_CHARS:
args_json = args_json[:_TOOL_ARGS_MAX_CHARS] + "…"
tool_line = f"Called: {tool_name}({args_json})"
# Walk newest → oldest, take the first assistant message with text.
assistant_text = ""
try:
messages = getattr(conversation, "_messages", []) or []
for msg in reversed(messages):
if getattr(msg, "role", None) != "assistant":
continue
content = getattr(msg, "content", "") or ""
if isinstance(content, str) and content.strip():
assistant_text = content.strip()
break
except Exception:
# Defensive — the agent loop must keep running even if the
# conversation structure changes shape.
assistant_text = ""
if not assistant_text:
assistant_text = "<no preceding reasoning>"
# Intent = tool descriptor (always intact) + reasoning (truncated).
head = f"{tool_line}\n\nReasoning before call:\n"
budget = _INTENT_MAX_CHARS - len(head)
if budget < 100:
# Tool descriptor is huge somehow — truncate it.
return head[:_INTENT_MAX_CHARS]
if len(assistant_text) > budget:
assistant_text = assistant_text[: budget - 1] + "…"
return head + assistant_text
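For concreteness, a hypothetical ``browser_screenshot`` call (a tool name this commit's quickstart text mentions) preceded by one assistant sentence would yield an intent string shaped like:

Called: browser_screenshot({"tab_id": 3})

Reasoning before call:
I need to verify the checkout button rendered after the JS patch.

Both the arguments and the reasoning line are invented for illustration; the shape (intact tool descriptor first, truncated reasoning after) is what the function above guarantees.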
async def caption_tool_image(
intent: str,
image_content: list[dict[str, Any]],
*,
timeout_s: float = 30.0,
) -> str | None:
"""Caption the given images using the configured ``vision_fallback`` model.
Returns the model's text response on success, or ``None`` on any
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
cost / latency / quality are auditable post-hoc, tagged with
``execution_id="vision_fallback_subagent"``.
"""
model = get_vision_fallback_model()
if not model:
return None
api_key = get_vision_fallback_api_key()
api_base = get_vision_fallback_api_base()
if not api_key:
logger.debug("vision_fallback configured but no API key resolved; skipping")
return None
try:
import litellm
except ImportError:
return None
user_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
user_blocks.extend(image_content)
messages = [
{"role": "system", "content": _VISION_SUBAGENT_SYSTEM},
{"role": "user", "content": user_blocks},
]
kwargs: dict[str, Any] = {
"model": model,
"messages": messages,
"max_tokens": 1024,
"timeout": timeout_s,
"api_key": api_key,
}
if api_base:
kwargs["api_base"] = api_base
started = datetime.now()
caption: str | None = None
error_text: str | None = None
try:
response = await litellm.acompletion(**kwargs)
text = (response.choices[0].message.content or "").strip()
if text:
caption = text
except Exception as exc:
error_text = f"{type(exc).__name__}: {exc}"
logger.debug("vision_fallback model '%s' failed: %s", model, exc)
# Best-effort audit log so users can grep ~/.hive/llm_logs/ for
# vision-fallback subagent calls. Failures here must not bubble.
try:
from framework.tracker.llm_debug_logger import log_llm_turn
# Don't dump the base64 image data into the log file — that
# would balloon the jsonl with mostly-binary noise.
elided_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
elided_blocks.extend(
{"type": "image_url", "image_url": {"url": "<elided>"}}
for _ in range(len(image_content))
)
log_llm_turn(
node_id="vision_fallback_subagent",
stream_id="vision_fallback",
execution_id="vision_fallback_subagent",
iteration=0,
system_prompt=_VISION_SUBAGENT_SYSTEM,
messages=[{"role": "user", "content": elided_blocks}],
assistant_text=caption or "",
tool_calls=[],
tool_results=[],
token_counts={
"model": model,
"elapsed_s": (datetime.now() - started).total_seconds(),
"error": error_text,
"num_images": len(image_content),
"intent_chars": len(intent),
},
)
except Exception:
pass
return caption
__all__ = ["caption_tool_image", "extract_intent_for_tool"]
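A hedged usage sketch of the module's public surface. The intent text and the base64 payload are placeholders, and the image block uses the OpenAI-style ``image_url`` shape the audit-log eliding code above assumes:

import asyncio

from framework.agent_loop.internals.vision_fallback import caption_tool_image

async def demo() -> None:
    # One tool-result image block, payload elided.
    blocks = [
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,<elided>"}}
    ]
    intent = 'Called: browser_screenshot({"tab_id": 3})\n\nReasoning before call:\n<elided>'
    caption = await caption_tool_image(intent, blocks, timeout_s=30.0)
    # None means unconfigured / no key / timeout / empty response.
    print(caption or "<fell through to the generic-caption rotation>")

asyncio.run(demo())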
@@ -155,6 +155,57 @@ def get_preferred_worker_model() -> str | None:
return None
def get_vision_fallback_model() -> str | None:
"""Return the configured vision-fallback model, or None if not configured.
Reads from the ``vision_fallback`` section of ~/.hive/configuration.json.
Used by the agent-loop hook that captions tool-result images when the
main agent's model cannot accept image content (text-only LLMs).
When this returns None the fallback chain skips the configured-subagent
stage and proceeds straight to the generic caption rotation
(``_describe_images_as_text``).
"""
vision = get_hive_config().get("vision_fallback", {})
if vision.get("provider") and vision.get("model"):
provider = str(vision["provider"])
model = str(vision["model"]).strip()
if provider.lower() == "openrouter" and model.lower().startswith("openrouter/"):
model = model[len("openrouter/") :]
if model:
return f"{provider}/{model}"
return None
def get_vision_fallback_api_key() -> str | None:
"""Return the API key for the vision-fallback model.
Resolution order: ``vision_fallback.api_key_env_var`` from the env,
then the default ``get_api_key()``. No subscription-token branches
vision fallback is intended for hosted vision models (Anthropic,
OpenAI, Google), not for the subscription-bearer providers.
"""
vision = get_hive_config().get("vision_fallback", {})
if not vision:
return get_api_key()
api_key_env_var = vision.get("api_key_env_var")
if api_key_env_var:
return os.environ.get(api_key_env_var)
return get_api_key()
def get_vision_fallback_api_base() -> str | None:
"""Return the api_base for the vision-fallback model, or None."""
vision = get_hive_config().get("vision_fallback", {})
if not vision:
return None
if vision.get("api_base"):
return vision["api_base"]
if str(vision.get("provider", "")).lower() == "openrouter":
return OPENROUTER_API_BASE
return None
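Tying the three getters together: a hypothetical ``vision_fallback`` block in ``~/.hive/configuration.json`` (the model id is a placeholder, not a recommendation; the api_base shown is OpenRouter's public endpoint):

{
  "vision_fallback": {
    "provider": "openrouter",
    "model": "openrouter/some-vendor/some-vision-model",
    "api_key_env_var": "OPENROUTER_API_KEY",
    "api_base": "https://openrouter.ai/api/v1"
  }
}

With this block, ``get_vision_fallback_model()`` strips the redundant ``openrouter/`` prefix and returns ``openrouter/some-vendor/some-vision-model``; the key is read from ``$OPENROUTER_API_KEY``; and had ``api_base`` been omitted, the openrouter provider would still fall back to ``OPENROUTER_API_BASE``.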
def get_worker_api_key() -> str | None:
"""Return the API key for the worker LLM, falling back to the default key."""
worker_llm = get_hive_config().get("worker_llm", {})
@@ -1042,6 +1042,49 @@ print(json.dumps(config, indent=2))
PY
}
save_vision_fallback() {
# Write the `vision_fallback` block to ~/.hive/configuration.json.
# Args: provider_id, model, env_var (api_key_env_var), api_base (optional)
# When provider_id is empty, REMOVE the block entirely (user opted out).
local provider_id="$1"
local model="$2"
local env_var="$3"
local api_base="${4:-}"
uv run python - "$provider_id" "$model" "$env_var" "$api_base" <<'PY'
import json
import sys
from pathlib import Path
provider_id, model, env_var, api_base = sys.argv[1:5]
cfg_path = Path.home() / ".hive" / "configuration.json"
cfg_path.parent.mkdir(parents=True, exist_ok=True)
try:
with open(cfg_path, encoding="utf-8-sig") as f:
config = json.load(f)
except (OSError, json.JSONDecodeError):
config = {}
# Empty provider_id means the user opted out — drop the block.
if not provider_id:
config.pop("vision_fallback", None)
else:
block = {"provider": provider_id, "model": model}
if env_var:
block["api_key_env_var"] = env_var
if api_base:
block["api_base"] = api_base
config["vision_fallback"] = block
tmp_path = cfg_path.with_name(cfg_path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
tmp_path.replace(cfg_path)
PY
}
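For reference, the two shapes in which the prompt below ends up calling this helper (the concrete provider/model pair is illustrative, though both names appear elsewhere in this commit):

# Write a block: provider_id, model, api_key_env_var, optional api_base.
save_vision_fallback "openai" "gpt-4o-mini" "OPENAI_API_KEY" ""
# Opt out: an empty provider_id removes any existing vision_fallback block.
save_vision_fallback "" "" "" ""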
# Source shell rc file to pick up existing env vars (temporarily disable set -e)
set +e
if [ -f "$SHELL_RC_FILE" ]; then
@@ -1772,6 +1815,128 @@ fi
echo ""
# ============================================================
# Vision Fallback (subagent for tool-result images)
# ============================================================
#
# When a tool returns an image (browser_screenshot, render_image, etc.)
# but the main agent's model is text-only, the framework can route the
# image through a separate VLM subagent that returns a text caption,
# preserving the agent's ability to reason about visual state.
#
# We always offer the prompt — even for vision-capable main models —
# so the user gets a working fallback if they ever swap to a text-only
# model. The block is dormant for vision-capable mains (the gating
# in agent_loop only fires for models on Hive's deny list).
if [ -n "$SELECTED_PROVIDER_ID" ]; then
echo -e "${YELLOW}${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}"
echo ""
echo -e " ${DIM}When a screenshot/image tool is called from a text-only model,${NC}"
echo -e " ${DIM}the framework can route the image through a vision-capable VLM${NC}"
echo -e " ${DIM}and inject the caption into the conversation. Inert when your${NC}"
echo -e " ${DIM}main model already supports vision (most do).${NC}"
echo ""
# Build the candidate list from the same model_catalog.json the main
# LLM step uses — never hardcode model IDs in this script. For each
# provider in the catalogue, take the catalogue's default model and
# the env var name it expects, then keep only providers the user
# already has an API key for. Output one TSV row per candidate:
# provider_id<TAB>model<TAB>env_var<TAB>display_name
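# Hypothetical example row (real rows depend on model_catalog.json and
# on which keys are exported at setup time):
# anthropic<TAB>default-model<TAB>ANTHROPIC_API_KEY<TAB>anthropic/default-model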
VISION_CANDIDATES_TSV=$(uv run python - <<'PY'
import os
from framework.llm.model_catalog import get_default_models, get_models_catalogue
# Map provider_id → the env-var name the framework reads its key from.
# Mirrors PROVIDER_ENV_VARS at the top of quickstart.sh, plus how the
# rest of the script picks an env var per provider.
PROVIDER_KEY_ENV = {
"anthropic": "ANTHROPIC_API_KEY",
"openai": "OPENAI_API_KEY",
"gemini": "GEMINI_API_KEY",
"groq": "GROQ_API_KEY",
"cerebras": "CEREBRAS_API_KEY",
"minimax": "MINIMAX_API_KEY",
"mistral": "MISTRAL_API_KEY",
"together": "TOGETHER_API_KEY",
"deepseek": "DEEPSEEK_API_KEY",
"kimi": "KIMI_API_KEY",
"openrouter": "OPENROUTER_API_KEY",
}
defaults = get_default_models()
catalog = get_models_catalogue()
for provider_id, default_model in sorted(defaults.items()):
env = PROVIDER_KEY_ENV.get(provider_id)
if not env:
continue
# Either GEMINI_API_KEY or GOOGLE_API_KEY unlocks gemini.
has_key = bool(os.environ.get(env))
if provider_id == "gemini" and not has_key:
if os.environ.get("GOOGLE_API_KEY"):
has_key = True
env = "GOOGLE_API_KEY"
if not has_key:
continue
# Display name: provider/model from the catalogue verbatim
display = f"{provider_id}/{default_model}"
print(f"{provider_id}\t{default_model}\t{env}\t{display}")
PY
)
if [ -z "$VISION_CANDIDATES_TSV" ]; then
echo -e " ${YELLOW}No matching API keys detected for any catalog provider.${NC}"
echo -e " ${DIM}Set an API key for any provider in model_catalog.json and rerun.${NC}"
echo -e " ${DIM}Skipping for now — text-only models will lose image content silently.${NC}"
else
# Materialise into bash array for selection
VISION_CANDIDATES=()
while IFS= read -r line; do
[ -n "$line" ] && VISION_CANDIDATES+=("$line")
done <<< "$VISION_CANDIDATES_TSV"
echo -e " ${BOLD}Available vision-fallback models${NC} ${DIM}(from model_catalog.json):${NC}"
echo -e " ${DIM}0)${NC} (skip — don't configure vision fallback)"
idx=1
for entry in "${VISION_CANDIDATES[@]}"; do
IFS=$'\t' read -r _vp _vm _vk _vd <<< "$entry"
echo -e " ${DIM}${idx})${NC} ${_vd} ${DIM}[\$${_vk}]${NC}"
idx=$((idx + 1))
done
echo ""
VISION_CHOICE=""
while true; do
read -r -p " Pick a vision-fallback model [1-${#VISION_CANDIDATES[@]}, 0=skip, default=1]: " VISION_CHOICE || VISION_CHOICE=""
VISION_CHOICE="${VISION_CHOICE:-1}"
if [[ "$VISION_CHOICE" =~ ^[0-9]+$ ]] && \
[ "$VISION_CHOICE" -ge 0 ] && \
[ "$VISION_CHOICE" -le "${#VISION_CANDIDATES[@]}" ]; then
break
fi
echo -e " ${YELLOW}Please enter 0 (skip) or 1-${#VISION_CANDIDATES[@]}.${NC}"
done
if [ "$VISION_CHOICE" = "0" ]; then
# Explicit skip — drop any prior block so config stays clean.
save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true
echo -e " ${DIM}skipped — no vision_fallback block written${NC}"
else
chosen="${VISION_CANDIDATES[$((VISION_CHOICE - 1))]}"
IFS=$'\t' read -r vf_provider vf_model vf_env vf_display <<< "$chosen"
echo -n " Saving vision_fallback... "
if save_vision_fallback "$vf_provider" "$vf_model" "$vf_env" "" > /dev/null; then
echo -e "${GREEN}${NC}"
echo -e " ${DIM}vision_fallback: ${vf_display} (key from \$${vf_env})${NC}"
else
echo -e "${RED}failed${NC}"
echo -e " ${YELLOW}Could not write vision_fallback to ~/.hive/configuration.json — non-fatal, edit manually if needed.${NC}"
fi
fi
fi
echo ""
fi
# ============================================================
# Browser Automation (GCU) — always enabled
# ============================================================