From 2ab5e6d7843c087bc3f225d7d98fc75ddd9deffb Mon Sep 17 00:00:00 2001 From: Richard Tang Date: Fri, 24 Apr 2026 20:17:41 -0700 Subject: [PATCH] feat: model support --- core/framework/agent_loop/agent_loop.py | 22 +-- .../agent_loop/internals/vision_fallback.py | 8 +- core/framework/llm/capabilities.py | 106 ++--------- core/framework/llm/model_catalog.json | 166 +++++++++++++----- core/framework/llm/model_catalog.py | 32 ++++ core/tests/test_model_catalog.py | 25 ++- quickstart.sh | 82 +++++++-- 7 files changed, 255 insertions(+), 186 deletions(-) diff --git a/core/framework/agent_loop/agent_loop.py b/core/framework/agent_loop/agent_loop.py index 2655d0fd..a32a4e8a 100644 --- a/core/framework/agent_loop/agent_loop.py +++ b/core/framework/agent_loop/agent_loop.py @@ -228,22 +228,16 @@ def _vision_fallback_active(model: str | None) -> bool: """Return True if tool-result images for *model* should be routed through the vision-fallback chain rather than sent to the model. - Trigger: the model appears in Hive's curated text-only deny list - (``capabilities.supports_image_tool_results`` returns False). - That list is the only reliable signal — LiteLLM's - ``supports_vision`` returns False for any unknown model - (including custom-served vision-capable models like Jackrong/Qwopus3.5) - so it cannot be used as a gate; and LiteLLM's openai chat - transformer doesn't strip image blocks anyway, so passing them - through to a vision-capable but litellm-unrecognised model still - works end-to-end. + Trigger: the model's catalog entry has ``supports_vision: false`` + (resolved via :func:`capabilities.supports_image_tool_results`, + which reads ``model_catalog.json``). Unknown models default to + vision-capable, so the fallback only fires when the catalog + explicitly says the model is text-only. The ``vision_fallback`` config block is the *substitution* model — - it doesn't widen the trigger. To force fallback for a model the - deny list doesn't cover yet, add it to - ``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` / - ``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime - config. + it doesn't widen the trigger. To force fallback for a model that + isn't catalogued yet, add an entry to ``model_catalog.json`` with + ``supports_vision: false`` rather than relying on a runtime config. """ if not model: return False diff --git a/core/framework/agent_loop/internals/vision_fallback.py b/core/framework/agent_loop/internals/vision_fallback.py index acde17db..15ed2f05 100644 --- a/core/framework/agent_loop/internals/vision_fallback.py +++ b/core/framework/agent_loop/internals/vision_fallback.py @@ -1,10 +1,10 @@ """Vision-fallback subagent for tool-result images on text-only LLMs. When a tool returns image content but the main agent's model can't -accept image blocks (per ``supports_image_tool_results``), the framework -strips the images before they ever reach the LLM. Without this module, -the agent then sees only the tool's text envelope (URL, dimensions, -size) and is blind to whatever the image actually shows. +accept image blocks (i.e. its catalog entry has ``supports_vision: false``), +the framework strips the images before they ever reach the LLM. Without +this module, the agent then sees only the tool's text envelope (URL, +dimensions, size) and is blind to whatever the image actually shows. 
This module provides: diff --git a/core/framework/llm/capabilities.py b/core/framework/llm/capabilities.py index 8a0aaa83..3fbef191 100644 --- a/core/framework/llm/capabilities.py +++ b/core/framework/llm/capabilities.py @@ -1,114 +1,32 @@ """Model capability checks for LLM providers. -Vision support rules are derived from official vendor documentation: -- ZAI (z.ai): docs.z.ai/guides/vlm — GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only -- MiniMax: platform.minimax.io/docs — minimax-vl-01 is vision; M2.x are text-only -- DeepSeek: api-docs.deepseek.com — deepseek-vl2 is vision; chat/reasoner are text-only -- Cerebras: inference-docs.cerebras.ai — no vision models at all -- Groq: console.groq.com/docs/vision — vision capable; treat as supported by default -- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names - don't reliably indicate vision support, so users must configure explicitly +Vision support is sourced from the curated ``model_catalog.json``. Each model +entry carries an optional ``supports_vision`` boolean; unknown models default +to vision-capable so hosted frontier models work out of the box. To toggle +support for a model, edit its catalog entry rather than this file. """ from __future__ import annotations from typing import TYPE_CHECKING +from framework.llm.model_catalog import model_supports_vision + if TYPE_CHECKING: from framework.llm.provider import Tool -def _model_name(model: str) -> str: - """Return the bare model name after stripping any 'provider/' prefix.""" - if "/" in model: - return model.split("/", 1)[1] - return model - - -# Step 1: explicit vision allow-list — these always support images regardless -# of what the provider-level rules say. Checked first so that e.g. glm-4.6v -# is allowed even though glm-4.6 is denied. -_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = ( - # ZAI/GLM vision models (docs.z.ai/guides/vlm) - "glm-4v", # GLM-4V series (legacy) - "glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx - # DeepSeek vision models - "deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny - # MiniMax vision model - "minimax-vl", # minimax-vl-01 -) - -# Step 2: provider-level deny — every model from this provider is text-only. -_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = ( - # Cerebras: inference-docs.cerebras.ai lists only text models - "cerebras/", - # Local runners: model names don't reliably indicate vision support - "ollama/", - "ollama_chat/", - "lm_studio/", - "vllm/", - "llamacpp/", -) - -# Step 3: per-model deny — text-only models within otherwise mixed providers. -# Matched against the bare model name (provider prefix stripped, lower-cased). -# The vision allow-list above is checked first, so vision variants of the same -# family are already handled before these deny patterns are reached. -_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = ( - # --- ZAI / GLM family --- - # text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-* - # vision: glm-4v, glm-4.6v (caught by allow-list above) - "glm-5", - "glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list - "glm-4.7", - "glm-4.5", - "zai-glm", - # --- DeepSeek --- - # text-only: deepseek-chat, deepseek-coder, deepseek-reasoner - # vision: deepseek-vl2 (caught by allow-list above) - # Note: LiteLLM's deepseek handler may flatten content lists for some models; - # VL models are allowed through and rely on LiteLLM's native VL support. 
- "deepseek-chat", - "deepseek-coder", - "deepseek-reasoner", - # --- MiniMax --- - # text-only: minimax-m2.*, minimax-text-*, abab* (legacy) - # vision: minimax-vl-01 (caught by allow-list above) - "minimax-m2", - "minimax-text", - "abab", -) - - def supports_image_tool_results(model: str) -> bool: """Return whether *model* can receive image content in messages. - Used to gate both user-message images and tool-result image blocks. - - Logic (checked in order): - 1. Vision allow-list → True (known vision model, skip all denies) - 2. Provider deny → False (entire provider is text-only) - 3. Model deny → False (specific text-only model within a mixed provider) - 4. Default → True (assume capable; unknown providers and models) + Thin wrapper over :func:`model_supports_vision` so existing call sites + keep working. Used to gate both user-message images and tool-result + image blocks. Empty model strings are treated as capable so the default + code path doesn't strip images before a provider is selected. """ - model_lower = model.lower() - bare = _model_name(model_lower) - - # 1. Explicit vision allow — takes priority over all denies - if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES): + if not model: return True - - # 2. Provider-level deny (all models from this provider are text-only) - if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES): - return False - - # 3. Per-model deny (text-only variants within mixed-capability families) - if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES): - return False - - # 5. Default: assume vision capable - # Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers - return True + return model_supports_vision(model) def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]: diff --git a/core/framework/llm/model_catalog.json b/core/framework/llm/model_catalog.json index e8d1320c..9929b009 100644 --- a/core/framework/llm/model_catalog.json +++ b/core/framework/llm/model_catalog.json @@ -9,47 +9,65 @@ "label": "Haiku 4.5 - Fast + cheap", "recommended": false, "max_tokens": 64000, - "max_context_tokens": 136000 + "max_context_tokens": 136000, + "supports_vision": true }, { "id": "claude-sonnet-4-5-20250929", "label": "Sonnet 4.5 - Best balance", "recommended": false, "max_tokens": 64000, - "max_context_tokens": 136000 + "max_context_tokens": 136000, + "supports_vision": true }, { "id": "claude-opus-4-6", "label": "Opus 4.6 - Most capable", "recommended": true, "max_tokens": 128000, - "max_context_tokens": 872000 + "max_context_tokens": 872000, + "supports_vision": true } ] }, "openai": { - "default_model": "gpt-5.4", + "default_model": "gpt-5.5", "models": [ { - "id": "gpt-5.4", - "label": "GPT-5.4 - Best intelligence", + "id": "gpt-5.5", + "label": "GPT-5.5 - Frontier coding + reasoning", "recommended": true, "max_tokens": 128000, - "max_context_tokens": 960000 + "max_context_tokens": 1050000, + "pricing_usd_per_mtok": { + "input": 5.00, + "output": 30.00 + }, + "supports_vision": true + }, + { + "id": "gpt-5.4", + "label": "GPT-5.4 - Previous flagship", + "recommended": false, + "max_tokens": 128000, + "max_context_tokens": 960000, + "supports_vision": true }, { "id": "gpt-5.4-mini", "label": "GPT-5.4 Mini - Faster + cheaper", "recommended": false, "max_tokens": 128000, - "max_context_tokens": 400000 + "max_context_tokens": 400000, + "supports_vision": true }, { "id": "gpt-5.4-nano", "label": "GPT-5.4 Nano - Cheapest high-volume", "recommended": false, 
"max_tokens": 128000, - "max_context_tokens": 400000 + "max_context_tokens": 400000, + "supports_vision": true } ] }, @@ -61,14 +79,16 @@ "label": "Gemini 3 Flash - Fast", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 240000 + "max_context_tokens": 240000, + "supports_vision": true }, { "id": "gemini-3.1-pro-preview-customtools", "label": "Gemini 3.1 Pro - Best quality", "recommended": true, "max_tokens": 32768, - "max_context_tokens": 240000 + "max_context_tokens": 240000, + "supports_vision": true } ] }, @@ -80,28 +100,32 @@ "label": "GPT-OSS 120B - Best reasoning", "recommended": true, "max_tokens": 65536, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false }, { "id": "openai/gpt-oss-20b", "label": "GPT-OSS 20B - Fast + cheaper", "recommended": false, "max_tokens": 65536, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false }, { "id": "llama-3.3-70b-versatile", "label": "Llama 3.3 70B - General purpose", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false }, { "id": "llama-3.1-8b-instant", "label": "Llama 3.1 8B - Fastest", "recommended": false, "max_tokens": 131072, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false } ] }, @@ -113,21 +137,24 @@ "label": "GPT-OSS 120B - Best production reasoning", "recommended": true, "max_tokens": 40960, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false }, { "id": "zai-glm-4.7", "label": "Z.ai GLM 4.7 - Strong coding preview", "recommended": true, "max_tokens": 40960, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false }, { "id": "qwen-3-235b-a22b-instruct-2507", "label": "Qwen 3 235B Instruct - Frontier preview", "recommended": false, "max_tokens": 40960, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false } ] }, @@ -143,14 +170,16 @@ "pricing_usd_per_mtok": { "input": 0.30, "output": 1.20 - } + }, + "supports_vision": false }, { "id": "MiniMax-M2.5", "label": "MiniMax M2.5 - Strong value", "recommended": false, "max_tokens": 40960, - "max_context_tokens": 180000 + "max_context_tokens": 180000, + "supports_vision": false } ] }, @@ -162,28 +191,32 @@ "label": "Mistral Large 3 - Best quality", "recommended": true, "max_tokens": 32768, - "max_context_tokens": 256000 + "max_context_tokens": 256000, + "supports_vision": true }, { "id": "mistral-medium-2508", "label": "Mistral Medium 3.1 - Balanced", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 128000 + "max_context_tokens": 128000, + "supports_vision": true }, { "id": "mistral-small-2603", "label": "Mistral Small 4 - Fast + capable", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 256000 + "max_context_tokens": 256000, + "supports_vision": true }, { "id": "codestral-2508", "label": "Codestral - Coding specialist", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 128000 + "max_context_tokens": 128000, + "supports_vision": false } ] }, @@ -195,47 +228,71 @@ "label": "DeepSeek V3.1 - Best general coding", "recommended": true, "max_tokens": 32768, - "max_context_tokens": 128000 + "max_context_tokens": 128000, + "supports_vision": false }, { "id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", "label": "Qwen3 Coder 480B - Advanced coding", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 262144 + 
"max_context_tokens": 262144, + "supports_vision": false }, { "id": "openai/gpt-oss-120b", "label": "GPT-OSS 120B - Strong reasoning", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 128000 + "max_context_tokens": 128000, + "supports_vision": false }, { "id": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "label": "Llama 3.3 70B Turbo - Fast baseline", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 131072 + "max_context_tokens": 131072, + "supports_vision": false } ] }, "deepseek": { - "default_model": "deepseek-chat", + "default_model": "deepseek-v4-pro", "models": [ { - "id": "deepseek-chat", - "label": "DeepSeek Chat - Fast default", + "id": "deepseek-v4-pro", + "label": "DeepSeek V4 Pro - Most capable", "recommended": true, - "max_tokens": 8192, - "max_context_tokens": 128000 + "max_tokens": 384000, + "max_context_tokens": 1000000, + "pricing_usd_per_mtok": { + "input": 1.74, + "output": 3.48, + "cache_read": 0.145 + }, + "supports_vision": false + }, + { + "id": "deepseek-v4-flash", + "label": "DeepSeek V4 Flash - Fast + cheap", + "recommended": true, + "max_tokens": 384000, + "max_context_tokens": 1000000, + "pricing_usd_per_mtok": { + "input": 0.14, + "output": 0.28, + "cache_read": 0.028 + }, + "supports_vision": false }, { "id": "deepseek-reasoner", - "label": "DeepSeek Reasoner - Deep thinking", + "label": "DeepSeek Reasoner - Legacy (deprecating)", "recommended": false, "max_tokens": 64000, - "max_context_tokens": 128000 + "max_context_tokens": 128000, + "supports_vision": false } ] }, @@ -252,7 +309,8 @@ "input": 0.60, "output": 2.50, "cache_read": 0.15 - } + }, + "supports_vision": true } ] }, @@ -264,14 +322,16 @@ "label": "Queen - Hive native", "recommended": true, "max_tokens": 32768, - "max_context_tokens": 180000 + "max_context_tokens": 180000, + "supports_vision": false }, { "id": "kimi-2.5", "label": "Kimi 2.5 - Via Hive", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 240000 + "max_context_tokens": 240000, + "supports_vision": true }, { "id": "glm-5.1", @@ -284,7 +344,8 @@ "output": 4.40, "cache_read": 0.26, "cache_creation": 0.0 - } + }, + "supports_vision": false } ] }, @@ -296,42 +357,48 @@ "label": "GPT-5.4 - Best overall", "recommended": true, "max_tokens": 128000, - "max_context_tokens": 872000 + "max_context_tokens": 872000, + "supports_vision": true }, { "id": "anthropic/claude-sonnet-4.6", "label": "Claude Sonnet 4.6 - Best coding balance", "recommended": false, "max_tokens": 64000, - "max_context_tokens": 872000 + "max_context_tokens": 872000, + "supports_vision": true }, { "id": "anthropic/claude-opus-4.6", "label": "Claude Opus 4.6 - Most capable", "recommended": false, "max_tokens": 128000, - "max_context_tokens": 872000 + "max_context_tokens": 872000, + "supports_vision": true }, { "id": "google/gemini-3.1-pro-preview-customtools", "label": "Gemini 3.1 Pro Preview - Long-context reasoning", "recommended": false, "max_tokens": 32768, - "max_context_tokens": 872000 + "max_context_tokens": 872000, + "supports_vision": true }, { "id": "qwen/qwen3.6-plus", "label": "Qwen 3.6 Plus - Strong reasoning", "recommended": true, "max_tokens": 32768, - "max_context_tokens": 240000 + "max_context_tokens": 240000, + "supports_vision": false }, { "id": "z-ai/glm-5v-turbo", "label": "GLM-5V Turbo - Vision capable", "recommended": true, "max_tokens": 32768, - "max_context_tokens": 192000 + "max_context_tokens": 192000, + "supports_vision": true }, { "id": "z-ai/glm-5.1", @@ -344,7 +411,8 @@ "output": 4.40, "cache_read": 
0.26, "cache_creation": 0.0 - } + }, + "supports_vision": false }, { "id": "minimax/minimax-m2.7", @@ -355,14 +423,16 @@ "pricing_usd_per_mtok": { "input": 0.30, "output": 1.20 - } + }, + "supports_vision": false }, { "id": "xiaomi/mimo-v2-pro", "label": "MiMo V2 Pro - Xiaomi multimodal", "recommended": true, "max_tokens": 64000, - "max_context_tokens": 240000 + "max_context_tokens": 240000, + "supports_vision": true } ] } diff --git a/core/framework/llm/model_catalog.py b/core/framework/llm/model_catalog.py index 75115ca5..d581206a 100644 --- a/core/framework/llm/model_catalog.py +++ b/core/framework/llm/model_catalog.py @@ -95,6 +95,10 @@ def _validate_model_catalog(data: dict[str, Any]) -> dict[str, Any]: if pricing is not None: _validate_pricing(pricing, f"{model_path}.pricing_usd_per_mtok") + supports_vision = model_map.get("supports_vision") + if supports_vision is not None and not isinstance(supports_vision, bool): + raise ModelCatalogError(f"{model_path}.supports_vision must be a boolean when present") + if not default_found: raise ModelCatalogError( f"{provider_path}.default_model={default_model!r} is not present in {provider_path}.models" @@ -229,6 +233,34 @@ def get_model_pricing(model_id: str) -> dict[str, float] | None: return None +def model_supports_vision(model_id: str) -> bool: + """Return whether *model_id* supports image inputs per the curated catalog. + + Looks up the bare model id (and the provider-prefix-stripped form) in the + catalog. Returns the model's ``supports_vision`` flag when found, defaulting + to ``True`` for unknown models or when the flag is absent — assume vision + capable for hosted providers, since modern frontier models support images + by default and the captioning fallback is more expensive than just letting + the provider handle the image. 
+ """ + if not model_id: + return True + + candidates = [model_id] + if "/" in model_id: + candidates.append(model_id.split("/", 1)[1]) + + for candidate in candidates: + for provider_info in load_model_catalog()["providers"].values(): + for model in provider_info["models"]: + if model["id"] == candidate: + flag = model.get("supports_vision") + if isinstance(flag, bool): + return flag + return True + return True + + def get_preset(preset_id: str) -> dict[str, Any] | None: """Return one preset entry.""" preset = load_model_catalog()["presets"].get(preset_id) diff --git a/core/tests/test_model_catalog.py b/core/tests/test_model_catalog.py index 09cff94b..ed965006 100644 --- a/core/tests/test_model_catalog.py +++ b/core/tests/test_model_catalog.py @@ -24,12 +24,12 @@ def test_default_models_exist_in_each_provider_catalogue(): def test_find_model_returns_curated_token_limits(): - model = model_catalog.find_model("openai", "gpt-5.4") + model = model_catalog.find_model("openai", "gpt-5.5") assert model is not None - assert model["label"] == "GPT-5.4 - Best intelligence" + assert model["label"] == "GPT-5.5 - Frontier coding + reasoning" assert model["max_tokens"] == 128000 - assert model["max_context_tokens"] == 960000 + assert model["max_context_tokens"] == 1050000 def test_anthropic_curated_limits_track_documented_caps_with_safe_input_budget(): @@ -125,15 +125,22 @@ def test_deepseek_catalog_tracks_current_api_models(): deepseek_default = model_catalog.get_default_models()["deepseek"] deepseek_models = model_catalog.get_models_catalogue()["deepseek"] - assert deepseek_default == "deepseek-chat" + assert deepseek_default == "deepseek-v4-pro" assert [model["id"] for model in deepseek_models] == [ - "deepseek-chat", + "deepseek-v4-pro", + "deepseek-v4-flash", "deepseek-reasoner", ] - assert deepseek_models[0]["max_tokens"] == 8192 - assert deepseek_models[0]["max_context_tokens"] == 128000 - assert deepseek_models[1]["max_tokens"] == 64000 - assert deepseek_models[1]["max_context_tokens"] == 128000 + # V4 family — 1M context, 384k max output, mirrors api-docs.deepseek.com pricing. + assert deepseek_models[0]["max_tokens"] == 384000 + assert deepseek_models[0]["max_context_tokens"] == 1000000 + assert deepseek_models[0]["pricing_usd_per_mtok"]["input"] == 1.74 + assert deepseek_models[0]["pricing_usd_per_mtok"]["output"] == 3.48 + assert deepseek_models[1]["pricing_usd_per_mtok"]["input"] == 0.14 + assert deepseek_models[1]["pricing_usd_per_mtok"]["output"] == 0.28 + # Legacy reasoner kept for back-compat while users migrate. + assert deepseek_models[2]["max_tokens"] == 64000 + assert deepseek_models[2]["max_context_tokens"] == 128000 def test_openrouter_catalog_tracks_current_frontier_set(): diff --git a/quickstart.sh b/quickstart.sh index 1ecbab06..5510044a 100755 --- a/quickstart.sh +++ b/quickstart.sh @@ -1352,9 +1352,11 @@ fi echo "" echo -e " ${CYAN}${BOLD}API key providers:${NC}" -# 8-13) API key providers — show (credential detected) if key already set -PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY) -PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model") +# 8-N) API key providers — show (credential detected) if key already set. +# Order is reflected directly in the menu numbering; the case dispatcher +# below resolves choice numbers via $((8 + index_in_arrays)). 
+PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY DEEPSEEK_API_KEY) +PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model" "DeepSeek - V4 family") for idx in "${!PROVIDER_MENU_ENVS[@]}"; do num=$((idx + 8)) env_var="${PROVIDER_MENU_ENVS[$idx]}" @@ -1365,14 +1367,16 @@ for idx in "${!PROVIDER_MENU_ENVS[@]}"; do fi done -# 14) Local (Ollama) — no API key needed +# Local (Ollama) — slot computed from the provider list so adding/removing +# API-key providers above doesn't require renumbering by hand. +OLLAMA_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]})) if [ "$OLLAMA_DETECTED" = true ]; then - echo -e " ${CYAN}14)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}" + echo -e " ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}" else - echo -e " ${CYAN}14)${NC} Local (Ollama) - No API key needed" + echo -e " ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed" fi -SKIP_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]} + 1)) +SKIP_CHOICE=$((OLLAMA_CHOICE + 1)) echo -e " ${CYAN}$SKIP_CHOICE)${NC} Skip for now" echo "" @@ -1578,6 +1582,13 @@ case $choice in SIGNUP_URL="https://openrouter.ai/keys" ;; 14) + SELECTED_ENV_VAR="DEEPSEEK_API_KEY" + SELECTED_PROVIDER_ID="deepseek" + SELECTED_API_BASE="https://api.deepseek.com" + PROVIDER_NAME="DeepSeek" + SIGNUP_URL="https://platform.deepseek.com/api_keys" + ;; + "$OLLAMA_CHOICE") # Local (Ollama) — no API key; pick model from ollama list if [ "$OLLAMA_DETECTED" != true ]; then echo "" @@ -1824,12 +1835,29 @@ echo "" # image through a separate VLM subagent that returns a text caption, # preserving the agent's ability to reason about visual state. # -# We always offer the prompt — even for vision-capable main models — -# so the user gets a working fallback if they ever swap to a text-only -# model. The block is dormant for vision-capable mains (the gating -# in agent_loop only fires for models on Hive's deny list). +# Skip entirely when the chosen main model already supports vision per +# the catalog's ``supports_vision`` flag — the fallback would never fire +# in that case, and prompting for it just adds friction. For text-only +# mains we still offer the prompt so the user can wire up a captioning +# subagent. -if [ -n "$SELECTED_PROVIDER_ID" ]; then +MAIN_MODEL_HAS_VISION="false" +if [ -n "$SELECTED_MODEL" ]; then + MAIN_MODEL_HAS_VISION=$(uv run python - "$SELECTED_MODEL" <<'PY' 2>/dev/null || echo "false" +import sys +from framework.llm.model_catalog import model_supports_vision +print("true" if model_supports_vision(sys.argv[1]) else "false") +PY +) +fi + +if [ -n "$SELECTED_PROVIDER_ID" ] && [ "$MAIN_MODEL_HAS_VISION" = "true" ]; then + # Drop any stale vision_fallback block so the config reflects the + # current main model's capabilities. 
+ save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true + echo -e "${GREEN}⬢${NC} Vision fallback ${DIM}skipped — ${SELECTED_MODEL} already supports vision${NC}" + echo "" +elif [ -n "$SELECTED_PROVIDER_ID" ]; then echo -e "${YELLOW}⬢${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}" echo "" echo -e " ${DIM}When a screenshot/image tool is called from a text-only model,${NC}" @@ -1840,9 +1868,13 @@ if [ -n "$SELECTED_PROVIDER_ID" ]; then # Build the candidate list from the same model_catalog.json the main # LLM step uses — never hardcode model IDs in this script. For each - # provider in the catalogue, take the catalogue's default model and - # the env var name it expects, then keep only providers the user - # already has an API key for. Output one TSV row per candidate: + # provider in the catalogue, pick a model whose ``supports_vision`` + # flag is true (since the fallback subagent's whole purpose is to + # caption images — a text-only candidate would be useless). Prefer + # the provider's default when it supports vision, otherwise fall + # back to the first vision-capable model in the provider's list. + # Skip the provider entirely if no model in its catalog supports + # vision. Output one TSV row per candidate: # provider_idmodelenv_vardisplay_name VISION_CANDIDATES_TSV=$(uv run python - <<'PY' import os @@ -1879,9 +1911,25 @@ for provider_id, default_model in sorted(defaults.items()): env = "GOOGLE_API_KEY" if not has_key: continue + # Pick a vision-capable model: prefer the catalog default if it has + # supports_vision=true, else the first vision-capable model in the + # provider's list. Skip the provider if none exist. + models = catalog.get(provider_id, []) + chosen = None + for m in models: + if m["id"] == default_model and m.get("supports_vision") is True: + chosen = m["id"] + break + if chosen is None: + for m in models: + if m.get("supports_vision") is True: + chosen = m["id"] + break + if chosen is None: + continue # Display name: provider/model from the catalogue verbatim - display = f"{provider_id}/{default_model}" - print(f"{provider_id}\t{default_model}\t{env}\t{display}") + display = f"{provider_id}/{chosen}" + print(f"{provider_id}\t{chosen}\t{env}\t{display}") PY )
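To force the fallback for a model that isn't catalogued yet, the agent_loop docstring above says to add a model_catalog.json entry with supports_vision set to false. A sketch of the required shape, following the fields the existing entries use (the file itself is JSON; a Python mapping is shown here for consistency with the sketch above, and the id and token limits are placeholders):

# Placeholder entry for a text-only model. Leaving "supports_vision" out means
# the model is treated as vision-capable; _validate_model_catalog rejects
# non-boolean values for the flag when it is present.
text_only_entry = {
    "id": "acme/text-only-model",         # placeholder id
    "label": "Acme Text-Only - Example",
    "recommended": False,
    "max_tokens": 32768,                  # placeholder limits
    "max_context_tokens": 131072,
    "supports_vision": False,             # triggers the vision-fallback routing
}

With that flag in place, the quickstart vision-candidate heredoc above also skips the model when picking a captioning subagent, since it only keeps candidates whose supports_vision flag is true.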