feat: image vision fallback
This commit is contained in:
@@ -85,7 +85,12 @@ from framework.agent_loop.internals.types import (
|
||||
JudgeVerdict,
|
||||
TriggerEvent,
|
||||
)
|
||||
from framework.agent_loop.internals.vision_fallback import (
|
||||
caption_tool_image,
|
||||
extract_intent_for_tool,
|
||||
)
|
||||
from framework.agent_loop.types import AgentContext, AgentProtocol, AgentResult
|
||||
from framework.config import get_vision_fallback_model
|
||||
from framework.host.event_bus import EventBus
|
||||
from framework.llm.capabilities import filter_tools_for_model, supports_image_tool_results
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
@@ -219,6 +224,52 @@ async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str |
|
||||
return None
|
||||
|
||||
|
||||
def _vision_fallback_active(model: str | None) -> bool:
    """Decide whether tool-result images for *model* must be routed
    through the vision-fallback chain instead of being sent to the model.

    The sole trigger is Hive's curated text-only deny list: a model for
    which ``capabilities.supports_image_tool_results`` answers False.
    LiteLLM's ``supports_vision`` is deliberately NOT consulted — it
    reports False for any model it doesn't recognise (including
    custom-served vision-capable models like Jackrong/Qwopus3.5), and
    LiteLLM's openai chat transformer doesn't strip image blocks anyway,
    so passing them through to a vision-capable but litellm-unrecognised
    model still works end-to-end.

    The ``vision_fallback`` config block only selects the *substitution*
    model — it never widens this trigger. To force fallback for a model
    the deny list doesn't cover yet, extend
    ``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` /
    ``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime
    config.
    """
    # Falsy model (None / "") short-circuits to False before the lookup.
    return bool(model) and not supports_image_tool_results(model)
|
||||
|
||||
|
||||
async def _captioning_chain(
    intent: str,
    image_content: list[dict[str, Any]],
) -> str | None:
    """Two-stage caption chain used by the agent-loop tool-result hook.

    Stage 1 asks the configured ``vision_fallback`` model, passing the
    agent's intent alongside the images. Stage 2 falls through to the
    generic-caption rotation (gpt-4o-mini → claude-3-haiku →
    gemini-flash) whenever stage 1 is unconfigured or returns nothing.

    Returns the caption text, or None when both stages fail. The caller
    owns the placeholder-on-None behaviour and the splice into the
    persisted tool-result content.
    """
    primary = await caption_tool_image(intent, image_content)
    if primary:
        return primary
    return await _describe_images_as_text(image_content)
|
||||
|
||||
|
||||
# Pattern for detecting context-window-exceeded errors across LLM providers.
|
||||
_CONTEXT_TOO_LARGE_RE = re.compile(
|
||||
r"context.{0,20}(length|window|limit|size)|"
|
||||
@@ -625,8 +676,23 @@ class AgentLoop(AgentProtocol):
|
||||
# Hide image-producing tools from text-only models so they never try
|
||||
# to call them. Avoids wasted turns + "screenshot failed" lessons
|
||||
# getting saved to memory. See framework.llm.capabilities.
|
||||
# EXCEPTION: when the model IS on the text-only deny list AND
|
||||
# a vision_fallback subagent is configured, leave image tools
|
||||
# visible. The post-execution hook in the inner tool loop
|
||||
# will route each image_content through the fallback VLM and
|
||||
# replace it with a text caption before the main agent sees
|
||||
# the result — so the main agent gets captions instead of
|
||||
# raw images, rather than losing the tool entirely. We DON'T
|
||||
# bypass the filter for vision-capable models (that would be
|
||||
# a no-op anyway — the filter doesn't fire for them) and we
|
||||
# DON'T bypass it without a configured fallback (the agent
|
||||
# would just see raw stripped tool results with no caption).
|
||||
_llm_model = ctx.llm.model if ctx.llm else ""
|
||||
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
|
||||
_text_only_main = _llm_model and not supports_image_tool_results(_llm_model)
|
||||
if _text_only_main and get_vision_fallback_model() is not None:
|
||||
_hidden_image_tools: list[str] = []
|
||||
else:
|
||||
tools, _hidden_image_tools = filter_tools_for_model(tools, _llm_model)
|
||||
|
||||
logger.info(
|
||||
"[%s] Tools available (%d): %s | direct_user_io=%s | judge=%s | hidden_image_tools=%s",
|
||||
@@ -3361,6 +3427,32 @@ class AgentLoop(AgentProtocol):
|
||||
|
||||
# Phase 3: record results into conversation in original order,
|
||||
# build logged/real lists, and publish completed events.
|
||||
#
|
||||
# Vision-fallback prefetch: a single turn may fire several
|
||||
# image-producing tools in parallel (e.g. one screenshot
|
||||
# per tab). Captioning each one takes a vision LLM round
|
||||
# trip (1–30 s). Doing them sequentially in this loop
|
||||
# would serialise that latency per image. Instead, kick
|
||||
# off all caption tasks concurrently NOW, and await each
|
||||
# one just-in-time inside the per-tc body. If only a
|
||||
# single image needs captioning, this collapses to a
|
||||
# single await with no overhead.
|
||||
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
|
||||
caption_tasks: dict[str, asyncio.Task[str | None]] = {}
|
||||
if _model_text_only:
|
||||
for tc in tool_calls[:executed_in_batch]:
|
||||
res = results_by_id.get(tc.tool_use_id)
|
||||
if not res or not res.image_content:
|
||||
continue
|
||||
intent = extract_intent_for_tool(
|
||||
conversation,
|
||||
tc.tool_name,
|
||||
tc.tool_input or {},
|
||||
)
|
||||
caption_tasks[tc.tool_use_id] = asyncio.create_task(
|
||||
_captioning_chain(intent, res.image_content)
|
||||
)
|
||||
|
||||
for tc in tool_calls[:executed_in_batch]:
|
||||
result = results_by_id.get(tc.tool_use_id)
|
||||
if result is None:
|
||||
@@ -3383,11 +3475,31 @@ class AgentLoop(AgentProtocol):
|
||||
logged_tool_calls.append(tool_entry)
|
||||
|
||||
image_content = result.image_content
|
||||
if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
|
||||
logger.info(
|
||||
"Stripping image_content from tool result; model '%s' does not support images in tool results",
|
||||
ctx.llm.model,
|
||||
)
|
||||
# Vision-fallback marker spliced into the persisted text
|
||||
# below. None when no captioning ran (vision-capable
|
||||
# main model, no images, or no fallback chain reached
|
||||
# this tool).
|
||||
vision_fallback_marker: str | None = None
|
||||
if image_content and tc.tool_use_id in caption_tasks:
|
||||
caption = await caption_tasks.pop(tc.tool_use_id)
|
||||
if caption:
|
||||
vision_fallback_marker = f"[vision-fallback caption]\n{caption}"
|
||||
logger.info(
|
||||
"vision_fallback: captioned %d image(s) for tool '%s' "
|
||||
"(model '%s' routed through fallback)",
|
||||
len(image_content),
|
||||
tc.tool_name,
|
||||
ctx.llm.model if ctx.llm else "?",
|
||||
)
|
||||
else:
|
||||
vision_fallback_marker = "[image stripped — vision fallback exhausted]"
|
||||
logger.info(
|
||||
"vision_fallback: exhausted; stripping %d image(s) from "
|
||||
"tool '%s' result without caption (model '%s')",
|
||||
len(image_content),
|
||||
tc.tool_name,
|
||||
ctx.llm.model if ctx.llm else "?",
|
||||
)
|
||||
image_content = None
|
||||
|
||||
# Apply replay-detector steer prefix if this call matched a
|
||||
@@ -3399,6 +3511,11 @@ class AgentLoop(AgentProtocol):
|
||||
if _prefix:
|
||||
stored_content = f"{_prefix}{stored_content or ''}"
|
||||
|
||||
# Splice the vision-fallback caption / placeholder into
|
||||
# the persisted text after any prefix has been applied.
|
||||
if vision_fallback_marker:
|
||||
stored_content = f"{stored_content or ''}\n\n{vision_fallback_marker}"
|
||||
|
||||
await conversation.add_tool_result(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=stored_content,
|
||||
|
||||
@@ -0,0 +1,220 @@
|
||||
"""Vision-fallback subagent for tool-result images on text-only LLMs.
|
||||
|
||||
When a tool returns image content but the main agent's model can't
|
||||
accept image blocks (per ``supports_image_tool_results``), the framework
|
||||
strips the images before they ever reach the LLM. Without this module,
|
||||
the agent then sees only the tool's text envelope (URL, dimensions,
|
||||
size) and is blind to whatever the image actually shows.
|
||||
|
||||
This module provides:
|
||||
|
||||
* ``caption_tool_image()`` — direct LiteLLM call to a configured
|
||||
vision model (``vision_fallback`` block in ``~/.hive/configuration.json``)
|
||||
that takes the agent's intent + the image(s) and returns a textual
|
||||
description tailored to that intent.
|
||||
* ``extract_intent_for_tool()`` — pull the most recent assistant text
|
||||
+ the tool call descriptor and concatenate them into a ≤2KB intent
|
||||
string the vision subagent can reason against.
|
||||
|
||||
Both helpers degrade silently — return ``None`` / a placeholder rather
|
||||
than raise — so a vision-fallback failure can never kill the main
|
||||
agent's run. The agent-loop call site is responsible for chaining
|
||||
through to the existing generic-caption rotation
|
||||
(``_describe_images_as_text``) on a None return.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.config import (
|
||||
get_vision_fallback_api_base,
|
||||
get_vision_fallback_api_key,
|
||||
get_vision_fallback_model,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..conversation import NodeConversation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Hard cap on the intent string handed to the vision subagent. The
|
||||
# subagent only needs the agent's recent reasoning + the tool descriptor;
|
||||
# anything longer is wasted tokens (and risks pushing past the vision
|
||||
# model's context with the image attached).
|
||||
_INTENT_MAX_CHARS = 4096
|
||||
|
||||
# Cap on the tool args JSON snippet inside the intent. Some tool inputs
|
||||
# (large strings, file contents) would dominate the intent if uncapped.
|
||||
_TOOL_ARGS_MAX_CHARS = 4096
|
||||
|
||||
# Subagent system prompt — kept short so it fits within any provider's
|
||||
# system-prompt budget alongside the user message + image. Tells the
|
||||
# subagent its role and constrains output format.
|
||||
_VISION_SUBAGENT_SYSTEM = (
|
||||
"You are a vision subagent for a text-only main agent. The main "
|
||||
"agent invoked a tool that returned the image(s) attached. Their "
|
||||
"intent (their reasoning + the tool call) is below. Describe what "
|
||||
"the image shows in service of their intent — concrete, factual, "
|
||||
"no speculation. If their intent asks a yes/no question, answer it "
|
||||
"directly first. Output plain text, no markdown, ≤ 600 words."
|
||||
)
|
||||
|
||||
|
||||
def extract_intent_for_tool(
    conversation: NodeConversation,
    tool_name: str,
    tool_args: dict[str, Any] | None,
) -> str:
    """Build the intent string passed to the vision subagent.

    Combines the most recent assistant text (the LLM's reasoning right
    before invoking the tool) with a structured tool-call descriptor.
    Truncates to ``_INTENT_MAX_CHARS`` total, favouring the head of the
    assistant text where goal-stating sentences usually live.

    If no preceding assistant text exists (rare — first turn), falls
    back to ``"<no preceding reasoning>"`` so the subagent still gets
    the tool descriptor.
    """
    # Serialise the tool args defensively; repr() is the last resort for
    # objects json can't render even with default=str.
    try:
        serialised = json.dumps(tool_args or {}, default=str)
    except Exception:
        serialised = repr(tool_args)
    if len(serialised) > _TOOL_ARGS_MAX_CHARS:
        serialised = serialised[:_TOOL_ARGS_MAX_CHARS] + "…"

    descriptor = f"Called: {tool_name}({serialised})"

    # Newest-first scan for the last assistant message carrying plain text.
    reasoning = ""
    try:
        history = getattr(conversation, "_messages", []) or []
        for entry in reversed(history):
            if getattr(entry, "role", None) != "assistant":
                continue
            body = getattr(entry, "content", "") or ""
            if isinstance(body, str) and body.strip():
                reasoning = body.strip()
                break
    except Exception:
        # Defensive — the agent loop must keep running even if the
        # conversation structure changes shape.
        reasoning = ""

    if not reasoning:
        reasoning = "<no preceding reasoning>"

    # Intent = tool descriptor (always intact) + reasoning (truncated).
    prefix = f"{descriptor}\n\nReasoning before call:\n"
    remaining = _INTENT_MAX_CHARS - len(prefix)
    if remaining < 100:
        # Tool descriptor is huge somehow — truncate it.
        return prefix[:_INTENT_MAX_CHARS]
    if len(reasoning) > remaining:
        reasoning = reasoning[: remaining - 1] + "…"
    return prefix + reasoning
|
||||
|
||||
|
||||
async def caption_tool_image(
    intent: str,
    image_content: list[dict[str, Any]],
    *,
    timeout_s: float = 30.0,
) -> str | None:
    """Caption the given images using the configured ``vision_fallback`` model.

    Returns the model's text response on success, or ``None`` on any
    failure (no config, no API key, timeout, exception, empty
    response). Callers chain to the next stage of the fallback on None.

    Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
    cost / latency / quality are auditable post-hoc, tagged with
    ``execution_id="vision_fallback_subagent"``.
    """
    model = get_vision_fallback_model()
    if not model:
        return None

    api_key = get_vision_fallback_api_key()
    api_base = get_vision_fallback_api_base()
    if not api_key:
        logger.debug("vision_fallback configured but no API key resolved; skipping")
        return None

    # Lazy import: litellm is heavy, and a missing install must degrade
    # to "no caption" rather than break the agent loop at import time.
    try:
        import litellm
    except ImportError:
        return None

    # User turn = intent text first, then the image blocks verbatim
    # (assumes the blocks are already in litellm/openai content-block
    # shape — produced upstream by the tool result).
    user_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
    user_blocks.extend(image_content)
    messages = [
        {"role": "system", "content": _VISION_SUBAGENT_SYSTEM},
        {"role": "user", "content": user_blocks},
    ]

    kwargs: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": 1024,
        "timeout": timeout_s,
        "api_key": api_key,
    }
    # api_base only matters for proxied providers (e.g. openrouter);
    # omit it entirely otherwise so litellm uses its own default.
    if api_base:
        kwargs["api_base"] = api_base

    # Naive local timestamp — only used for the elapsed_s audit field.
    started = datetime.now()
    caption: str | None = None
    error_text: str | None = None
    try:
        response = await litellm.acompletion(**kwargs)
        # Empty/whitespace-only responses are treated as failure so the
        # caller chains to the next fallback stage.
        text = (response.choices[0].message.content or "").strip()
        if text:
            caption = text
    except Exception as exc:
        error_text = f"{type(exc).__name__}: {exc}"
        logger.debug("vision_fallback model '%s' failed: %s", model, exc)

    # Best-effort audit log so users can grep ~/.hive/llm_logs/ for
    # vision-fallback subagent calls. Failures here must not bubble.
    try:
        from framework.tracker.llm_debug_logger import log_llm_turn

        # Don't dump the base64 image data into the log file — that
        # would balloon the jsonl with mostly-binary noise.
        elided_blocks: list[dict[str, Any]] = [{"type": "text", "text": intent}]
        elided_blocks.extend(
            {"type": "image_url", "image_url": {"url": "<elided>"}}
            for _ in range(len(image_content))
        )
        log_llm_turn(
            node_id="vision_fallback_subagent",
            stream_id="vision_fallback",
            execution_id="vision_fallback_subagent",
            iteration=0,
            system_prompt=_VISION_SUBAGENT_SYSTEM,
            messages=[{"role": "user", "content": elided_blocks}],
            assistant_text=caption or "",
            tool_calls=[],
            tool_results=[],
            # token_counts doubles as a free-form metadata bag here;
            # error is None on success so failures are grep-able.
            token_counts={
                "model": model,
                "elapsed_s": (datetime.now() - started).total_seconds(),
                "error": error_text,
                "num_images": len(image_content),
                "intent_chars": len(intent),
            },
        )
    except Exception:
        pass

    return caption
|
||||
|
||||
|
||||
__all__ = ["caption_tool_image", "extract_intent_for_tool"]
|
||||
@@ -155,6 +155,57 @@ def get_preferred_worker_model() -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def get_vision_fallback_model() -> str | None:
    """Return the configured vision-fallback model, or None if not configured.

    Reads from the ``vision_fallback`` section of ~/.hive/configuration.json.
    Used by the agent-loop hook that captions tool-result images when the
    main agent's model cannot accept image content (text-only LLMs).

    When this returns None the fallback chain skips the configured-subagent
    stage and proceeds straight to the generic caption rotation
    (``_describe_images_as_text``).
    """
    block = get_hive_config().get("vision_fallback", {})
    if not (block.get("provider") and block.get("model")):
        return None
    provider = str(block["provider"])
    model_name = str(block["model"]).strip()
    # Users sometimes paste "openrouter/<model>" verbatim; strip the
    # duplicate prefix so the combined id isn't "openrouter/openrouter/…".
    if provider.lower() == "openrouter" and model_name.lower().startswith("openrouter/"):
        model_name = model_name[len("openrouter/") :]
    # A model that was whitespace (or only the stripped prefix) is as
    # good as unconfigured.
    return f"{provider}/{model_name}" if model_name else None
|
||||
|
||||
|
||||
def get_vision_fallback_api_key() -> str | None:
    """Return the API key for the vision-fallback model.

    Resolution order: the env var named by
    ``vision_fallback.api_key_env_var`` (when that block and field are
    configured), otherwise the default ``get_api_key()``. No
    subscription-token branches — vision fallback is intended for hosted
    vision models (Anthropic, OpenAI, Google), not for the
    subscription-bearer providers.
    """
    block = get_hive_config().get("vision_fallback", {})
    env_name = block.get("api_key_env_var") if block else None
    if env_name:
        return os.environ.get(env_name)
    return get_api_key()
|
||||
|
||||
|
||||
def get_vision_fallback_api_base() -> str | None:
    """Return the api_base for the vision-fallback model, or None.

    An explicit ``vision_fallback.api_base`` wins; the OpenRouter
    default base is supplied for the openrouter provider; every other
    case returns None so litellm uses its own endpoint for the model.
    """
    block = get_hive_config().get("vision_fallback", {})
    if not block:
        return None
    explicit = block.get("api_base")
    if explicit:
        return explicit
    provider = str(block.get("provider", "")).lower()
    return OPENROUTER_API_BASE if provider == "openrouter" else None
|
||||
|
||||
|
||||
def get_worker_api_key() -> str | None:
|
||||
"""Return the API key for the worker LLM, falling back to the default key."""
|
||||
worker_llm = get_hive_config().get("worker_llm", {})
|
||||
|
||||
+165
@@ -1042,6 +1042,49 @@ print(json.dumps(config, indent=2))
|
||||
PY
|
||||
}
|
||||
|
||||
save_vision_fallback() {
    # Write the `vision_fallback` block to ~/.hive/configuration.json.
    # Args: provider_id, model, env_var (api_key_env_var), api_base (optional)
    # When provider_id is empty, REMOVE the block entirely (user opted out).
    local provider_id="$1"
    local model="$2"
    local env_var="$3"
    local api_base="${4:-}"

    # Quoted 'PY' delimiter: nothing in the heredoc is shell-expanded;
    # the four values travel as argv so JSON-hostile characters are safe.
    uv run python - "$provider_id" "$model" "$env_var" "$api_base" <<'PY'
import json
import sys
from pathlib import Path

provider_id, model, env_var, api_base = sys.argv[1:5]

cfg_path = Path.home() / ".hive" / "configuration.json"
cfg_path.parent.mkdir(parents=True, exist_ok=True)

# utf-8-sig tolerates a BOM from Windows editors; a missing or corrupt
# config starts fresh rather than aborting the installer.
try:
    with open(cfg_path, encoding="utf-8-sig") as f:
        config = json.load(f)
except (OSError, json.JSONDecodeError):
    config = {}

# Empty provider_id means the user opted out — drop the block.
if not provider_id:
    config.pop("vision_fallback", None)
else:
    block = {"provider": provider_id, "model": model}
    if env_var:
        block["api_key_env_var"] = env_var
    if api_base:
        block["api_base"] = api_base
    config["vision_fallback"] = block

# Write-to-temp then rename so a crash mid-write can't truncate the
# existing config (rename is atomic on the same filesystem).
tmp_path = cfg_path.with_name(cfg_path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)
tmp_path.replace(cfg_path)
PY
}
|
||||
|
||||
# Source shell rc file to pick up existing env vars (temporarily disable set -e)
|
||||
set +e
|
||||
if [ -f "$SHELL_RC_FILE" ]; then
|
||||
@@ -1772,6 +1815,128 @@ fi
|
||||
|
||||
echo ""
|
||||
|
||||
# ============================================================
|
||||
# Vision Fallback (subagent for tool-result images)
|
||||
# ============================================================
|
||||
#
|
||||
# When a tool returns an image (browser_screenshot, render_image, etc.)
|
||||
# but the main agent's model is text-only, the framework can route the
|
||||
# image through a separate VLM subagent that returns a text caption,
|
||||
# preserving the agent's ability to reason about visual state.
|
||||
#
|
||||
# We always offer the prompt — even for vision-capable main models —
|
||||
# so the user gets a working fallback if they ever swap to a text-only
|
||||
# model. The block is dormant for vision-capable mains (the gating
|
||||
# in agent_loop only fires for models on Hive's deny list).
|
||||
|
||||
if [ -n "$SELECTED_PROVIDER_ID" ]; then
|
||||
echo -e "${YELLOW}⬢${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}"
|
||||
echo ""
|
||||
echo -e " ${DIM}When a screenshot/image tool is called from a text-only model,${NC}"
|
||||
echo -e " ${DIM}the framework can route the image through a vision-capable VLM${NC}"
|
||||
echo -e " ${DIM}and inject the caption into the conversation. Inert when your${NC}"
|
||||
echo -e " ${DIM}main model already supports vision (most do).${NC}"
|
||||
echo ""
|
||||
|
||||
# Build the candidate list from the same model_catalog.json the main
|
||||
# LLM step uses — never hardcode model IDs in this script. For each
|
||||
# provider in the catalogue, take the catalogue's default model and
|
||||
# the env var name it expects, then keep only providers the user
|
||||
# already has an API key for. Output one TSV row per candidate:
|
||||
# provider_id<TAB>model<TAB>env_var<TAB>display_name
|
||||
VISION_CANDIDATES_TSV=$(uv run python - <<'PY'
|
||||
import os
|
||||
from framework.llm.model_catalog import get_default_models, get_models_catalogue
|
||||
|
||||
# Map provider_id → the env-var name the framework reads its key from.
|
||||
# Mirrors PROVIDER_ENV_VARS at the top of quickstart.sh, plus how the
|
||||
# rest of the script picks an env var per provider.
|
||||
PROVIDER_KEY_ENV = {
|
||||
"anthropic": "ANTHROPIC_API_KEY",
|
||||
"openai": "OPENAI_API_KEY",
|
||||
"gemini": "GEMINI_API_KEY",
|
||||
"groq": "GROQ_API_KEY",
|
||||
"cerebras": "CEREBRAS_API_KEY",
|
||||
"minimax": "MINIMAX_API_KEY",
|
||||
"mistral": "MISTRAL_API_KEY",
|
||||
"together": "TOGETHER_API_KEY",
|
||||
"deepseek": "DEEPSEEK_API_KEY",
|
||||
"kimi": "KIMI_API_KEY",
|
||||
"openrouter": "OPENROUTER_API_KEY",
|
||||
}
|
||||
|
||||
defaults = get_default_models()
|
||||
catalog = get_models_catalogue()
|
||||
for provider_id, default_model in sorted(defaults.items()):
|
||||
env = PROVIDER_KEY_ENV.get(provider_id)
|
||||
if not env:
|
||||
continue
|
||||
# GEMINI_API_KEY OR GOOGLE_API_KEY both unlock gemini
|
||||
has_key = bool(os.environ.get(env))
|
||||
if provider_id == "gemini" and not has_key:
|
||||
if os.environ.get("GOOGLE_API_KEY"):
|
||||
has_key = True
|
||||
env = "GOOGLE_API_KEY"
|
||||
if not has_key:
|
||||
continue
|
||||
# Display name: provider/model from the catalogue verbatim
|
||||
display = f"{provider_id}/{default_model}"
|
||||
print(f"{provider_id}\t{default_model}\t{env}\t{display}")
|
||||
PY
|
||||
)
|
||||
|
||||
if [ -z "$VISION_CANDIDATES_TSV" ]; then
|
||||
echo -e " ${YELLOW}No matching API keys detected for any catalog provider.${NC}"
|
||||
echo -e " ${DIM}Set an API key for any provider in model_catalog.json and rerun.${NC}"
|
||||
echo -e " ${DIM}Skipping for now — text-only models will lose image content silently.${NC}"
|
||||
else
|
||||
# Materialise into bash array for selection
|
||||
VISION_CANDIDATES=()
|
||||
while IFS= read -r line; do
|
||||
[ -n "$line" ] && VISION_CANDIDATES+=("$line")
|
||||
done <<< "$VISION_CANDIDATES_TSV"
|
||||
|
||||
echo -e " ${BOLD}Available vision-fallback models${NC} ${DIM}(from model_catalog.json):${NC}"
|
||||
echo -e " ${DIM}0)${NC} (skip — don't configure vision fallback)"
|
||||
idx=1
|
||||
for entry in "${VISION_CANDIDATES[@]}"; do
|
||||
IFS=$'\t' read -r _vp _vm _vk _vd <<< "$entry"
|
||||
echo -e " ${DIM}${idx})${NC} ${_vd} ${DIM}[\$${_vk}]${NC}"
|
||||
idx=$((idx + 1))
|
||||
done
|
||||
echo ""
|
||||
VISION_CHOICE=""
|
||||
while true; do
|
||||
read -r -p " Pick a vision-fallback model [1-${#VISION_CANDIDATES[@]}, 0=skip, default=1]: " VISION_CHOICE || VISION_CHOICE=""
|
||||
VISION_CHOICE="${VISION_CHOICE:-1}"
|
||||
if [[ "$VISION_CHOICE" =~ ^[0-9]+$ ]] && \
|
||||
[ "$VISION_CHOICE" -ge 0 ] && \
|
||||
[ "$VISION_CHOICE" -le "${#VISION_CANDIDATES[@]}" ]; then
|
||||
break
|
||||
fi
|
||||
echo -e " ${YELLOW}Please enter 0 (skip) or 1-${#VISION_CANDIDATES[@]}.${NC}"
|
||||
done
|
||||
|
||||
if [ "$VISION_CHOICE" = "0" ]; then
|
||||
# Explicit skip — drop any prior block so config stays clean.
|
||||
save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true
|
||||
echo -e " ${DIM}skipped — no vision_fallback block written${NC}"
|
||||
else
|
||||
chosen="${VISION_CANDIDATES[$((VISION_CHOICE - 1))]}"
|
||||
IFS=$'\t' read -r vf_provider vf_model vf_env vf_display <<< "$chosen"
|
||||
echo -n " Saving vision_fallback... "
|
||||
if save_vision_fallback "$vf_provider" "$vf_model" "$vf_env" "" > /dev/null; then
|
||||
echo -e "${GREEN}⬢${NC}"
|
||||
echo -e " ${DIM}vision_fallback: ${vf_display} (key from \$${vf_env})${NC}"
|
||||
else
|
||||
echo -e "${RED}failed${NC}"
|
||||
echo -e " ${YELLOW}Could not write vision_fallback to ~/.hive/configuration.json — non-fatal, edit manually if needed.${NC}"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# ============================================================
|
||||
# Browser Automation (GCU) — always enabled
|
||||
# ============================================================
|
||||
|
||||
Reference in New Issue
Block a user