feat: model support

Richard Tang
2026-04-24 20:17:41 -07:00
parent e7f9b7d791
commit 2ab5e6d784
7 changed files with 255 additions and 186 deletions
+8 -14
@@ -228,22 +228,16 @@ def _vision_fallback_active(model: str | None) -> bool:
"""Return True if tool-result images for *model* should be routed
through the vision-fallback chain rather than sent to the model.
Trigger: the model appears in Hive's curated text-only deny list
(``capabilities.supports_image_tool_results`` returns False).
That list is the only reliable signal: LiteLLM's
``supports_vision`` returns False for any unknown model
(including custom-served vision-capable models like Jackrong/Qwopus3.5),
so it cannot be used as a gate; and LiteLLM's openai chat
transformer doesn't strip image blocks anyway, so passing them
through to a vision-capable but litellm-unrecognised model still
works end-to-end.
Trigger: the model's catalog entry has ``supports_vision: false``
(resolved via :func:`capabilities.supports_image_tool_results`,
which reads ``model_catalog.json``). Unknown models default to
vision-capable, so the fallback only fires when the catalog
explicitly says the model is text-only.
The ``vision_fallback`` config block is the *substitution* model;
it doesn't widen the trigger. To force fallback for a model the
deny list doesn't cover yet, add it to
``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` /
``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime
config.
it doesn't widen the trigger. To force fallback for a model that
isn't catalogued yet, add an entry to ``model_catalog.json`` with
``supports_vision: false`` rather than relying on a runtime config.
"""
if not model:
return False
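A minimal sketch of the trigger this docstring describes; the hunk shows only the empty-model guard, so the final line is an assumption, as is the import path of the capabilities module:

```python
# Hedged sketch -- not the exact body from this commit. Assumes
# supports_image_tool_results is importable from framework.llm.capabilities.
from framework.llm.capabilities import supports_image_tool_results

def _vision_fallback_active(model: str | None) -> bool:
    if not model:
        return False  # guard shown in the diff above
    # Fires only when the catalog explicitly marks the model text-only.
    return not supports_image_tool_results(model)
```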
@@ -1,10 +1,10 @@
"""Vision-fallback subagent for tool-result images on text-only LLMs.
When a tool returns image content but the main agent's model can't
accept image blocks (per ``supports_image_tool_results``), the framework
strips the images before they ever reach the LLM. Without this module,
the agent then sees only the tool's text envelope (URL, dimensions,
size) and is blind to whatever the image actually shows.
accept image blocks (i.e. its catalog entry has ``supports_vision: false``),
the framework strips the images before they ever reach the LLM. Without
this module, the agent then sees only the tool's text envelope (URL,
dimensions, size) and is blind to whatever the image actually shows.
This module provides:
+12 -94
@@ -1,114 +1,32 @@
"""Model capability checks for LLM providers.
Vision support rules are derived from official vendor documentation:
- ZAI (z.ai): docs.z.ai/guides/vlm - GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only
- MiniMax: platform.minimax.io/docs - minimax-vl-01 is vision; M2.x are text-only
- DeepSeek: api-docs.deepseek.com - deepseek-vl2 is vision; chat/reasoner are text-only
- Cerebras: inference-docs.cerebras.ai - no vision models at all
- Groq: console.groq.com/docs/vision - vision capable; treat as supported by default
- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names
don't reliably indicate vision support, so users must configure explicitly
Vision support is sourced from the curated ``model_catalog.json``. Each model
entry carries an optional ``supports_vision`` boolean; unknown models default
to vision-capable so hosted frontier models work out of the box. To toggle
support for a model, edit its catalog entry rather than this file.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from framework.llm.model_catalog import model_supports_vision
if TYPE_CHECKING:
from framework.llm.provider import Tool
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
if "/" in model:
return model.split("/", 1)[1]
return model
# Step 1: explicit vision allow-list — these always support images regardless
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
# is allowed even though glm-4.6 is denied.
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
# ZAI/GLM vision models (docs.z.ai/guides/vlm)
"glm-4v", # GLM-4V series (legacy)
"glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
# DeepSeek vision models
"deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
# MiniMax vision model
"minimax-vl", # minimax-vl-01
)
# Step 2: provider-level deny — every model from this provider is text-only.
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
# Cerebras: inference-docs.cerebras.ai lists only text models
"cerebras/",
# Local runners: model names don't reliably indicate vision support
"ollama/",
"ollama_chat/",
"lm_studio/",
"vllm/",
"llamacpp/",
)
# Step 3: per-model deny — text-only models within otherwise mixed providers.
# Matched against the bare model name (provider prefix stripped, lower-cased).
# The vision allow-list above is checked first, so vision variants of the same
# family are already handled before these deny patterns are reached.
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
# --- ZAI / GLM family ---
# text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
# vision: glm-4v, glm-4.6v (caught by allow-list above)
"glm-5",
"glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
"glm-4.7",
"glm-4.5",
"zai-glm",
# --- DeepSeek ---
# text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
# vision: deepseek-vl2 (caught by allow-list above)
# Note: LiteLLM's deepseek handler may flatten content lists for some models;
# VL models are allowed through and rely on LiteLLM's native VL support.
"deepseek-chat",
"deepseek-coder",
"deepseek-reasoner",
# --- MiniMax ---
# text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
# vision: minimax-vl-01 (caught by allow-list above)
"minimax-m2",
"minimax-text",
"abab",
)
def supports_image_tool_results(model: str) -> bool:
"""Return whether *model* can receive image content in messages.
Used to gate both user-message images and tool-result image blocks.
Logic (checked in order):
1. Vision allow-list -> True (known vision model, skip all denies)
2. Provider deny -> False (entire provider is text-only)
3. Model deny -> False (specific text-only model within a mixed provider)
4. Default -> True (assume capable; unknown providers and models)
Thin wrapper over :func:`model_supports_vision` so existing call sites
keep working. Used to gate both user-message images and tool-result
image blocks. Empty model strings are treated as capable so the default
code path doesn't strip images before a provider is selected.
"""
model_lower = model.lower()
bare = _model_name(model_lower)
# 1. Explicit vision allow — takes priority over all denies
if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES):
if not model:
return True
# 2. Provider-level deny (all models from this provider are text-only)
if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES):
return False
# 3. Per-model deny (text-only variants within mixed-capability families)
if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
return False
# 4. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True
return model_supports_vision(model)
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
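Given the catalog entries added later in this commit, the rewritten gate behaves roughly as follows (illustrative values; note that deepseek-chat is no longer catalogued after this change, so it now falls back to the vision-capable default):

```python
from framework.llm.capabilities import supports_image_tool_results

# Catalogued text-only model -> images are stripped / fallback fires.
assert supports_image_tool_results("deepseek-v4-pro") is False
# Catalogued vision model -> images pass through.
assert supports_image_tool_results("claude-opus-4-6") is True
# Unknown or empty model -> default to vision-capable.
assert supports_image_tool_results("some-custom/unknown-model") is True
assert supports_image_tool_results("") is True
```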
+118 -48
@@ -9,47 +9,65 @@
"label": "Haiku 4.5 - Fast + cheap",
"recommended": false,
"max_tokens": 64000,
"max_context_tokens": 136000
"max_context_tokens": 136000,
"supports_vision": true
},
{
"id": "claude-sonnet-4-5-20250929",
"label": "Sonnet 4.5 - Best balance",
"recommended": false,
"max_tokens": 64000,
"max_context_tokens": 136000
"max_context_tokens": 136000,
"supports_vision": true
},
{
"id": "claude-opus-4-6",
"label": "Opus 4.6 - Most capable",
"recommended": true,
"max_tokens": 128000,
"max_context_tokens": 872000
"max_context_tokens": 872000,
"supports_vision": true
}
]
},
"openai": {
"default_model": "gpt-5.4",
"default_model": "gpt-5.5",
"models": [
{
"id": "gpt-5.4",
"label": "GPT-5.4 - Best intelligence",
"id": "gpt-5.5",
"label": "GPT-5.5 - Frontier coding + reasoning",
"recommended": true,
"max_tokens": 128000,
"max_context_tokens": 960000
"max_context_tokens": 1050000,
"pricing_usd_per_mtok": {
"input": 5.00,
"output": 30.00
},
"supports_vision": true
},
{
"id": "gpt-5.4",
"label": "GPT-5.4 - Previous flagship",
"recommended": false,
"max_tokens": 128000,
"max_context_tokens": 960000,
"supports_vision": true
},
{
"id": "gpt-5.4-mini",
"label": "GPT-5.4 Mini - Faster + cheaper",
"recommended": false,
"max_tokens": 128000,
"max_context_tokens": 400000
"max_context_tokens": 400000,
"supports_vision": true
},
{
"id": "gpt-5.4-nano",
"label": "GPT-5.4 Nano - Cheapest high-volume",
"recommended": false,
"max_tokens": 128000,
"max_context_tokens": 400000
"max_context_tokens": 400000,
"supports_vision": true
}
]
},
@@ -61,14 +79,16 @@
"label": "Gemini 3 Flash - Fast",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 240000
"max_context_tokens": 240000,
"supports_vision": true
},
{
"id": "gemini-3.1-pro-preview-customtools",
"label": "Gemini 3.1 Pro - Best quality",
"recommended": true,
"max_tokens": 32768,
"max_context_tokens": 240000
"max_context_tokens": 240000,
"supports_vision": true
}
]
},
@@ -80,28 +100,32 @@
"label": "GPT-OSS 120B - Best reasoning",
"recommended": true,
"max_tokens": 65536,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
},
{
"id": "openai/gpt-oss-20b",
"label": "GPT-OSS 20B - Fast + cheaper",
"recommended": false,
"max_tokens": 65536,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
},
{
"id": "llama-3.3-70b-versatile",
"label": "Llama 3.3 70B - General purpose",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
},
{
"id": "llama-3.1-8b-instant",
"label": "Llama 3.1 8B - Fastest",
"recommended": false,
"max_tokens": 131072,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
}
]
},
@@ -113,21 +137,24 @@
"label": "GPT-OSS 120B - Best production reasoning",
"recommended": true,
"max_tokens": 40960,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
},
{
"id": "zai-glm-4.7",
"label": "Z.ai GLM 4.7 - Strong coding preview",
"recommended": true,
"max_tokens": 40960,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
},
{
"id": "qwen-3-235b-a22b-instruct-2507",
"label": "Qwen 3 235B Instruct - Frontier preview",
"recommended": false,
"max_tokens": 40960,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
}
]
},
@@ -143,14 +170,16 @@
"pricing_usd_per_mtok": {
"input": 0.30,
"output": 1.20
}
},
"supports_vision": false
},
{
"id": "MiniMax-M2.5",
"label": "MiniMax M2.5 - Strong value",
"recommended": false,
"max_tokens": 40960,
"max_context_tokens": 180000
"max_context_tokens": 180000,
"supports_vision": false
}
]
},
@@ -162,28 +191,32 @@
"label": "Mistral Large 3 - Best quality",
"recommended": true,
"max_tokens": 32768,
"max_context_tokens": 256000
"max_context_tokens": 256000,
"supports_vision": true
},
{
"id": "mistral-medium-2508",
"label": "Mistral Medium 3.1 - Balanced",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 128000
"max_context_tokens": 128000,
"supports_vision": true
},
{
"id": "mistral-small-2603",
"label": "Mistral Small 4 - Fast + capable",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 256000
"max_context_tokens": 256000,
"supports_vision": true
},
{
"id": "codestral-2508",
"label": "Codestral - Coding specialist",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 128000
"max_context_tokens": 128000,
"supports_vision": false
}
]
},
@@ -195,47 +228,71 @@
"label": "DeepSeek V3.1 - Best general coding",
"recommended": true,
"max_tokens": 32768,
"max_context_tokens": 128000
"max_context_tokens": 128000,
"supports_vision": false
},
{
"id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8",
"label": "Qwen3 Coder 480B - Advanced coding",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 262144
"max_context_tokens": 262144,
"supports_vision": false
},
{
"id": "openai/gpt-oss-120b",
"label": "GPT-OSS 120B - Strong reasoning",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 128000
"max_context_tokens": 128000,
"supports_vision": false
},
{
"id": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"label": "Llama 3.3 70B Turbo - Fast baseline",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 131072
"max_context_tokens": 131072,
"supports_vision": false
}
]
},
"deepseek": {
"default_model": "deepseek-chat",
"default_model": "deepseek-v4-pro",
"models": [
{
"id": "deepseek-chat",
"label": "DeepSeek Chat - Fast default",
"id": "deepseek-v4-pro",
"label": "DeepSeek V4 Pro - Most capable",
"recommended": true,
"max_tokens": 8192,
"max_context_tokens": 128000
"max_tokens": 384000,
"max_context_tokens": 1000000,
"pricing_usd_per_mtok": {
"input": 1.74,
"output": 3.48,
"cache_read": 0.145
},
"supports_vision": false
},
{
"id": "deepseek-v4-flash",
"label": "DeepSeek V4 Flash - Fast + cheap",
"recommended": true,
"max_tokens": 384000,
"max_context_tokens": 1000000,
"pricing_usd_per_mtok": {
"input": 0.14,
"output": 0.28,
"cache_read": 0.028
},
"supports_vision": false
},
{
"id": "deepseek-reasoner",
"label": "DeepSeek Reasoner - Deep thinking",
"label": "DeepSeek Reasoner - Legacy (deprecating)",
"recommended": false,
"max_tokens": 64000,
"max_context_tokens": 128000
"max_context_tokens": 128000,
"supports_vision": false
}
]
},
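The new ``pricing_usd_per_mtok`` blocks are per million tokens. A hedged sketch of the cost arithmetic they imply; the helper name is hypothetical, not part of this commit:

```python
def request_cost_usd(pricing: dict[str, float], input_toks: int,
                     output_toks: int, cached_toks: int = 0) -> float:
    """Hypothetical helper: price one request from a catalog pricing block."""
    uncached = input_toks - cached_toks
    cost = uncached * pricing["input"] / 1_000_000
    # Cache-read tokens bill at the discounted rate when one is listed.
    cost += cached_toks * pricing.get("cache_read", pricing["input"]) / 1_000_000
    cost += output_toks * pricing["output"] / 1_000_000
    return cost

# deepseek-v4-flash per the entry above: 100k input (40k cached), 8k output.
flash = {"input": 0.14, "output": 0.28, "cache_read": 0.028}
print(round(request_cost_usd(flash, 100_000, 8_000, 40_000), 6))  # 0.01176
```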
@@ -252,7 +309,8 @@
"input": 0.60,
"output": 2.50,
"cache_read": 0.15
}
},
"supports_vision": true
}
]
},
@@ -264,14 +322,16 @@
"label": "Queen - Hive native",
"recommended": true,
"max_tokens": 32768,
"max_context_tokens": 180000
"max_context_tokens": 180000,
"supports_vision": false
},
{
"id": "kimi-2.5",
"label": "Kimi 2.5 - Via Hive",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 240000
"max_context_tokens": 240000,
"supports_vision": true
},
{
"id": "glm-5.1",
@@ -284,7 +344,8 @@
"output": 4.40,
"cache_read": 0.26,
"cache_creation": 0.0
}
},
"supports_vision": false
}
]
},
@@ -296,42 +357,48 @@
"label": "GPT-5.4 - Best overall",
"recommended": true,
"max_tokens": 128000,
"max_context_tokens": 872000
"max_context_tokens": 872000,
"supports_vision": true
},
{
"id": "anthropic/claude-sonnet-4.6",
"label": "Claude Sonnet 4.6 - Best coding balance",
"recommended": false,
"max_tokens": 64000,
"max_context_tokens": 872000
"max_context_tokens": 872000,
"supports_vision": true
},
{
"id": "anthropic/claude-opus-4.6",
"label": "Claude Opus 4.6 - Most capable",
"recommended": false,
"max_tokens": 128000,
"max_context_tokens": 872000
"max_context_tokens": 872000,
"supports_vision": true
},
{
"id": "google/gemini-3.1-pro-preview-customtools",
"label": "Gemini 3.1 Pro Preview - Long-context reasoning",
"recommended": false,
"max_tokens": 32768,
"max_context_tokens": 872000
"max_context_tokens": 872000,
"supports_vision": true
},
{
"id": "qwen/qwen3.6-plus",
"label": "Qwen 3.6 Plus - Strong reasoning",
"recommended": true,
"max_tokens": 32768,
"max_context_tokens": 240000
"max_context_tokens": 240000,
"supports_vision": false
},
{
"id": "z-ai/glm-5v-turbo",
"label": "GLM-5V Turbo - Vision capable",
"recommended": true,
"max_tokens": 32768,
"max_context_tokens": 192000
"max_context_tokens": 192000,
"supports_vision": true
},
{
"id": "z-ai/glm-5.1",
@@ -344,7 +411,8 @@
"output": 4.40,
"cache_read": 0.26,
"cache_creation": 0.0
}
},
"supports_vision": false
},
{
"id": "minimax/minimax-m2.7",
@@ -355,14 +423,16 @@
"pricing_usd_per_mtok": {
"input": 0.30,
"output": 1.20
}
},
"supports_vision": false
},
{
"id": "xiaomi/mimo-v2-pro",
"label": "MiMo V2 Pro - Xiaomi multimodal",
"recommended": true,
"max_tokens": 64000,
"max_context_tokens": 240000
"max_context_tokens": 240000,
"supports_vision": true
}
]
}
+32
@@ -95,6 +95,10 @@ def _validate_model_catalog(data: dict[str, Any]) -> dict[str, Any]:
if pricing is not None:
_validate_pricing(pricing, f"{model_path}.pricing_usd_per_mtok")
supports_vision = model_map.get("supports_vision")
if supports_vision is not None and not isinstance(supports_vision, bool):
raise ModelCatalogError(f"{model_path}.supports_vision must be a boolean when present")
if not default_found:
raise ModelCatalogError(
f"{provider_path}.default_model={default_model!r} is not present in {provider_path}.models"
@@ -229,6 +233,34 @@ def get_model_pricing(model_id: str) -> dict[str, float] | None:
return None
def model_supports_vision(model_id: str) -> bool:
"""Return whether *model_id* supports image inputs per the curated catalog.
Looks up the model id as given (and, for ``provider/model`` ids, the
prefix-stripped form) in the catalog. Returns the model's
``supports_vision`` flag when found, defaulting to ``True`` for unknown
models or when the flag is absent: assume vision-capable for hosted
providers, since modern frontier models support images by default and
the captioning fallback is more expensive than just letting the
provider handle the image.
"""
if not model_id:
return True
candidates = [model_id]
if "/" in model_id:
candidates.append(model_id.split("/", 1)[1])
for candidate in candidates:
for provider_info in load_model_catalog()["providers"].values():
for model in provider_info["models"]:
if model["id"] == candidate:
flag = model.get("supports_vision")
if isinstance(flag, bool):
return flag
return True
return True
def get_preset(preset_id: str) -> dict[str, Any] | None:
"""Return one preset entry."""
preset = load_model_catalog()["presets"].get(preset_id)
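Illustrative lookups against the catalog entries in this commit, showing the prefix-stripped retry (assuming the aggregator section above is the openrouter provider):

```python
from framework.llm.model_catalog import model_supports_vision

# Direct id hit on a text-only entry.
assert model_supports_vision("deepseek-reasoner") is False
# "provider/rest" ids retry with the prefix stripped:
# "openrouter/z-ai/glm-5v-turbo" -> "z-ai/glm-5v-turbo", catalogued as vision.
assert model_supports_vision("openrouter/z-ai/glm-5v-turbo") is True
# Unknown ids default to vision-capable.
assert model_supports_vision("acme/brand-new-model") is True
```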
+16 -9
@@ -24,12 +24,12 @@ def test_default_models_exist_in_each_provider_catalogue():
def test_find_model_returns_curated_token_limits():
model = model_catalog.find_model("openai", "gpt-5.4")
model = model_catalog.find_model("openai", "gpt-5.5")
assert model is not None
assert model["label"] == "GPT-5.4 - Best intelligence"
assert model["label"] == "GPT-5.5 - Frontier coding + reasoning"
assert model["max_tokens"] == 128000
assert model["max_context_tokens"] == 960000
assert model["max_context_tokens"] == 1050000
def test_anthropic_curated_limits_track_documented_caps_with_safe_input_budget():
@@ -125,15 +125,22 @@ def test_deepseek_catalog_tracks_current_api_models():
deepseek_default = model_catalog.get_default_models()["deepseek"]
deepseek_models = model_catalog.get_models_catalogue()["deepseek"]
assert deepseek_default == "deepseek-chat"
assert deepseek_default == "deepseek-v4-pro"
assert [model["id"] for model in deepseek_models] == [
"deepseek-chat",
"deepseek-v4-pro",
"deepseek-v4-flash",
"deepseek-reasoner",
]
assert deepseek_models[0]["max_tokens"] == 8192
assert deepseek_models[0]["max_context_tokens"] == 128000
assert deepseek_models[1]["max_tokens"] == 64000
assert deepseek_models[1]["max_context_tokens"] == 128000
# V4 family — 1M context, 384k max output, mirrors api-docs.deepseek.com pricing.
assert deepseek_models[0]["max_tokens"] == 384000
assert deepseek_models[0]["max_context_tokens"] == 1000000
assert deepseek_models[0]["pricing_usd_per_mtok"]["input"] == 1.74
assert deepseek_models[0]["pricing_usd_per_mtok"]["output"] == 3.48
assert deepseek_models[1]["pricing_usd_per_mtok"]["input"] == 0.14
assert deepseek_models[1]["pricing_usd_per_mtok"]["output"] == 0.28
# Legacy reasoner kept for back-compat while users migrate.
assert deepseek_models[2]["max_tokens"] == 64000
assert deepseek_models[2]["max_context_tokens"] == 128000
def test_openrouter_catalog_tracks_current_frontier_set():
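A hedged companion test in the same style, pinning the new flags for the DeepSeek V4 family (the test name is illustrative, not from this commit):

```python
def test_deepseek_v4_family_is_text_only():
    from framework.llm.model_catalog import model_supports_vision

    # All three entries above carry supports_vision: false.
    assert model_supports_vision("deepseek-v4-pro") is False
    assert model_supports_vision("deepseek-v4-flash") is False
    assert model_supports_vision("deepseek-reasoner") is False
```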
+65 -17
@@ -1352,9 +1352,11 @@ fi
echo ""
echo -e " ${CYAN}${BOLD}API key providers:${NC}"
# 8-13) API key providers — show (credential detected) if key already set
PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY)
PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model")
# 8-N) API key providers — show (credential detected) if key already set.
# Order is reflected directly in the menu numbering; the case dispatcher
# below resolves choice numbers via $((8 + index_in_arrays)).
PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY DEEPSEEK_API_KEY)
PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model" "DeepSeek - V4 family")
for idx in "${!PROVIDER_MENU_ENVS[@]}"; do
num=$((idx + 8))
env_var="${PROVIDER_MENU_ENVS[$idx]}"
@@ -1365,14 +1367,16 @@ for idx in "${!PROVIDER_MENU_ENVS[@]}"; do
fi
done
# 14) Local (Ollama) — no API key needed
# Local (Ollama) — slot computed from the provider list so adding/removing
# API-key providers above doesn't require renumbering by hand.
OLLAMA_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]}))
if [ "$OLLAMA_DETECTED" = true ]; then
echo -e " ${CYAN}14)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}"
echo -e " ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}"
else
echo -e " ${CYAN}14)${NC} Local (Ollama) - No API key needed"
echo -e " ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed"
fi
SKIP_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]} + 1))
SKIP_CHOICE=$((OLLAMA_CHOICE + 1))
echo -e " ${CYAN}$SKIP_CHOICE)${NC} Skip for now"
echo ""
@@ -1578,6 +1582,13 @@ case $choice in
SIGNUP_URL="https://openrouter.ai/keys"
;;
14)
SELECTED_ENV_VAR="DEEPSEEK_API_KEY"
SELECTED_PROVIDER_ID="deepseek"
SELECTED_API_BASE="https://api.deepseek.com"
PROVIDER_NAME="DeepSeek"
SIGNUP_URL="https://platform.deepseek.com/api_keys"
;;
"$OLLAMA_CHOICE")
# Local (Ollama) — no API key; pick model from ollama list
if [ "$OLLAMA_DETECTED" != true ]; then
echo ""
@@ -1824,12 +1835,29 @@ echo ""
# image through a separate VLM subagent that returns a text caption,
# preserving the agent's ability to reason about visual state.
#
# We always offer the prompt — even for vision-capable main models —
# so the user gets a working fallback if they ever swap to a text-only
# model. The block is dormant for vision-capable mains (the gating
# in agent_loop only fires for models on Hive's deny list).
# Skip entirely when the chosen main model already supports vision per
# the catalog's ``supports_vision`` flag — the fallback would never fire
# in that case, and prompting for it just adds friction. For text-only
# mains we still offer the prompt so the user can wire up a captioning
# subagent.
if [ -n "$SELECTED_PROVIDER_ID" ]; then
MAIN_MODEL_HAS_VISION="false"
if [ -n "$SELECTED_MODEL" ]; then
MAIN_MODEL_HAS_VISION=$(uv run python - "$SELECTED_MODEL" <<'PY' 2>/dev/null || echo "false"
import sys
from framework.llm.model_catalog import model_supports_vision
print("true" if model_supports_vision(sys.argv[1]) else "false")
PY
)
fi
if [ -n "$SELECTED_PROVIDER_ID" ] && [ "$MAIN_MODEL_HAS_VISION" = "true" ]; then
# Drop any stale vision_fallback block so the config reflects the
# current main model's capabilities.
save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true
echo -e "${GREEN}${NC} Vision fallback ${DIM}skipped — ${SELECTED_MODEL} already supports vision${NC}"
echo ""
elif [ -n "$SELECTED_PROVIDER_ID" ]; then
echo -e "${YELLOW}${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}"
echo ""
echo -e " ${DIM}When a screenshot/image tool is called from a text-only model,${NC}"
@@ -1840,9 +1868,13 @@ if [ -n "$SELECTED_PROVIDER_ID" ]; then
# Build the candidate list from the same model_catalog.json the main
# LLM step uses — never hardcode model IDs in this script. For each
# provider in the catalogue, take the catalogue's default model and
# the env var name it expects, then keep only providers the user
# already has an API key for. Output one TSV row per candidate:
# provider in the catalogue, pick a model whose ``supports_vision``
# flag is true (since the fallback subagent's whole purpose is to
# caption images — a text-only candidate would be useless). Prefer
# the provider's default when it supports vision, otherwise fall
# back to the first vision-capable model in the provider's list.
# Skip the provider entirely if no model in its catalog supports
# vision. Output one TSV row per candidate:
# provider_id<TAB>model<TAB>env_var<TAB>display_name
VISION_CANDIDATES_TSV=$(uv run python - <<'PY'
import os
@@ -1879,9 +1911,25 @@ for provider_id, default_model in sorted(defaults.items()):
env = "GOOGLE_API_KEY"
if not has_key:
continue
# Pick a vision-capable model: prefer the catalog default if it has
# supports_vision=true, else the first vision-capable model in the
# provider's list. Skip the provider if none exist.
models = catalog.get(provider_id, [])
chosen = None
for m in models:
if m["id"] == default_model and m.get("supports_vision") is True:
chosen = m["id"]
break
if chosen is None:
for m in models:
if m.get("supports_vision") is True:
chosen = m["id"]
break
if chosen is None:
continue
# Display name: provider/model from the catalogue verbatim
display = f"{provider_id}/{default_model}"
print(f"{provider_id}\t{default_model}\t{env}\t{display}")
display = f"{provider_id}/{chosen}"
print(f"{provider_id}\t{chosen}\t{env}\t{display}")
PY
)
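A sketch of the resulting data, with assumed key state: if only OPENAI_API_KEY is set, the snippet emits one row (openai's default gpt-5.5 is vision-capable), and deepseek is skipped entirely since none of its catalogued models support vision:

```python
# Hedged sketch of one emitted row and how a consumer would split it
# (the real script reads this TSV in bash); field order per the comment above.
VISION_CANDIDATES_TSV = "openai\tgpt-5.5\tOPENAI_API_KEY\topenai/gpt-5.5"

for line in VISION_CANDIDATES_TSV.splitlines():
    provider_id, model, env_var, display = line.split("\t")
    print(f"{display} (requires {env_var})")  # openai/gpt-5.5 (requires OPENAI_API_KEY)
```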