feat: catalog-driven model vision support

Richard Tang
2026-04-24 20:17:41 -07:00
parent e7f9b7d791
commit 2ab5e6d784
7 changed files with 255 additions and 186 deletions
+8 -14
@@ -228,22 +228,16 @@ def _vision_fallback_active(model: str | None) -> bool:
     """Return True if tool-result images for *model* should be routed
     through the vision-fallback chain rather than sent to the model.

-    Trigger: the model appears in Hive's curated text-only deny list
-    (``capabilities.supports_image_tool_results`` returns False).
-    That list is the only reliable signal — LiteLLM's
-    ``supports_vision`` returns False for any unknown model
-    (including custom-served vision-capable models like Jackrong/Qwopus3.5)
-    so it cannot be used as a gate; and LiteLLM's openai chat
-    transformer doesn't strip image blocks anyway, so passing them
-    through to a vision-capable but litellm-unrecognised model still
-    works end-to-end.
+    Trigger: the model's catalog entry has ``supports_vision: false``
+    (resolved via :func:`capabilities.supports_image_tool_results`,
+    which reads ``model_catalog.json``). Unknown models default to
+    vision-capable, so the fallback only fires when the catalog
+    explicitly says the model is text-only.

     The ``vision_fallback`` config block is the *substitution* model —
-    it doesn't widen the trigger. To force fallback for a model the
-    deny list doesn't cover yet, add it to
-    ``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` /
-    ``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime
-    config.
+    it doesn't widen the trigger. To force fallback for a model that
+    isn't catalogued yet, add an entry to ``model_catalog.json`` with
+    ``supports_vision: false`` rather than relying on a runtime config.
     """
     if not model:
         return False
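Taken with the body shown above, the predicate reduces to a None guard plus a negation of the capability check. A minimal sketch of that behaviour, assuming the module is importable as framework.llm.capabilities (the diff names it only as capabilities; the sibling import elsewhere in this commit suggests the framework.llm package) and using ids this commit's catalog actually carries:

    from framework.llm.capabilities import supports_image_tool_results

    def vision_fallback_active(model: str | None) -> bool:
        # No model selected yet: nothing to reroute.
        if not model:
            return False
        # Fire only when the catalog explicitly marks the model text-only.
        return not supports_image_tool_results(model)

    assert vision_fallback_active(None) is False            # no model yet
    assert vision_fallback_active("zai-glm-4.7") is True    # catalogued text-only
    assert vision_fallback_active("gpt-5.5") is False       # catalogued vision-capable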
@@ -1,10 +1,10 @@
 """Vision-fallback subagent for tool-result images on text-only LLMs.

 When a tool returns image content but the main agent's model can't
-accept image blocks (per ``supports_image_tool_results``), the framework
-strips the images before they ever reach the LLM. Without this module,
-the agent then sees only the tool's text envelope (URL, dimensions,
-size) and is blind to whatever the image actually shows.
+accept image blocks (i.e. its catalog entry has ``supports_vision: false``),
+the framework strips the images before they ever reach the LLM. Without
+this module, the agent then sees only the tool's text envelope (URL,
+dimensions, size) and is blind to whatever the image actually shows.

 This module provides:
+12 -94
@@ -1,114 +1,32 @@
"""Model capability checks for LLM providers. """Model capability checks for LLM providers.
Vision support rules are derived from official vendor documentation: Vision support is sourced from the curated ``model_catalog.json``. Each model
- ZAI (z.ai): docs.z.ai/guides/vlm GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only entry carries an optional ``supports_vision`` boolean; unknown models default
- MiniMax: platform.minimax.io/docs minimax-vl-01 is vision; M2.x are text-only to vision-capable so hosted frontier models work out of the box. To toggle
- DeepSeek: api-docs.deepseek.com deepseek-vl2 is vision; chat/reasoner are text-only support for a model, edit its catalog entry rather than this file.
- Cerebras: inference-docs.cerebras.ai no vision models at all
- Groq: console.groq.com/docs/vision vision capable; treat as supported by default
- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names
don't reliably indicate vision support, so users must configure explicitly
""" """
from __future__ import annotations from __future__ import annotations
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from framework.llm.model_catalog import model_supports_vision
if TYPE_CHECKING: if TYPE_CHECKING:
from framework.llm.provider import Tool from framework.llm.provider import Tool
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
if "/" in model:
return model.split("/", 1)[1]
return model
# Step 1: explicit vision allow-list — these always support images regardless
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
# is allowed even though glm-4.6 is denied.
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
# ZAI/GLM vision models (docs.z.ai/guides/vlm)
"glm-4v", # GLM-4V series (legacy)
"glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
# DeepSeek vision models
"deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
# MiniMax vision model
"minimax-vl", # minimax-vl-01
)
# Step 2: provider-level deny — every model from this provider is text-only.
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
# Cerebras: inference-docs.cerebras.ai lists only text models
"cerebras/",
# Local runners: model names don't reliably indicate vision support
"ollama/",
"ollama_chat/",
"lm_studio/",
"vllm/",
"llamacpp/",
)
# Step 3: per-model deny — text-only models within otherwise mixed providers.
# Matched against the bare model name (provider prefix stripped, lower-cased).
# The vision allow-list above is checked first, so vision variants of the same
# family are already handled before these deny patterns are reached.
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
# --- ZAI / GLM family ---
# text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
# vision: glm-4v, glm-4.6v (caught by allow-list above)
"glm-5",
"glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
"glm-4.7",
"glm-4.5",
"zai-glm",
# --- DeepSeek ---
# text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
# vision: deepseek-vl2 (caught by allow-list above)
# Note: LiteLLM's deepseek handler may flatten content lists for some models;
# VL models are allowed through and rely on LiteLLM's native VL support.
"deepseek-chat",
"deepseek-coder",
"deepseek-reasoner",
# --- MiniMax ---
# text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
# vision: minimax-vl-01 (caught by allow-list above)
"minimax-m2",
"minimax-text",
"abab",
)
def supports_image_tool_results(model: str) -> bool: def supports_image_tool_results(model: str) -> bool:
"""Return whether *model* can receive image content in messages. """Return whether *model* can receive image content in messages.
Used to gate both user-message images and tool-result image blocks. Thin wrapper over :func:`model_supports_vision` so existing call sites
keep working. Used to gate both user-message images and tool-result
Logic (checked in order): image blocks. Empty model strings are treated as capable so the default
1. Vision allow-list True (known vision model, skip all denies) code path doesn't strip images before a provider is selected.
2. Provider deny False (entire provider is text-only)
3. Model deny False (specific text-only model within a mixed provider)
4. Default True (assume capable; unknown providers and models)
""" """
model_lower = model.lower() if not model:
bare = _model_name(model_lower)
# 1. Explicit vision allow — takes priority over all denies
if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES):
return True return True
return model_supports_vision(model)
# 2. Provider-level deny (all models from this provider are text-only)
if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES):
return False
# 3. Per-model deny (text-only variants within mixed-capability families)
if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
return False
# 5. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]: def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
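The wrapper's contract per its docstring: empty strings short-circuit to True, everything else defers to the catalog. A quick sanity check under the same assumed import path (the text-only id comes from the catalog below; the unknown id is made up for illustration):

    from framework.llm.capabilities import supports_image_tool_results

    assert supports_image_tool_results("") is True                  # no provider selected yet
    assert supports_image_tool_results("deepseek-v4-pro") is False  # catalogued text-only
    assert supports_image_tool_results("acme/brand-new-model") is True  # unknown, default capable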
+118 -48
@@ -9,47 +9,65 @@
         "label": "Haiku 4.5 - Fast + cheap",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 136000
+        "max_context_tokens": 136000,
+        "supports_vision": true
       },
       {
         "id": "claude-sonnet-4-5-20250929",
         "label": "Sonnet 4.5 - Best balance",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 136000
+        "max_context_tokens": 136000,
+        "supports_vision": true
       },
       {
         "id": "claude-opus-4-6",
         "label": "Opus 4.6 - Most capable",
         "recommended": true,
         "max_tokens": 128000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       }
     ]
   },
   "openai": {
-    "default_model": "gpt-5.4",
+    "default_model": "gpt-5.5",
     "models": [
       {
-        "id": "gpt-5.4",
-        "label": "GPT-5.4 - Best intelligence",
+        "id": "gpt-5.5",
+        "label": "GPT-5.5 - Frontier coding + reasoning",
         "recommended": true,
         "max_tokens": 128000,
-        "max_context_tokens": 960000
+        "max_context_tokens": 1050000,
+        "pricing_usd_per_mtok": {
+          "input": 5.00,
+          "output": 30.00
+        },
+        "supports_vision": true
+      },
+      {
+        "id": "gpt-5.4",
+        "label": "GPT-5.4 - Previous flagship",
+        "recommended": false,
+        "max_tokens": 128000,
+        "max_context_tokens": 960000,
+        "supports_vision": true
       },
       {
         "id": "gpt-5.4-mini",
         "label": "GPT-5.4 Mini - Faster + cheaper",
         "recommended": false,
         "max_tokens": 128000,
-        "max_context_tokens": 400000
+        "max_context_tokens": 400000,
+        "supports_vision": true
       },
       {
         "id": "gpt-5.4-nano",
         "label": "GPT-5.4 Nano - Cheapest high-volume",
         "recommended": false,
         "max_tokens": 128000,
-        "max_context_tokens": 400000
+        "max_context_tokens": 400000,
+        "supports_vision": true
       }
     ]
   },
@@ -61,14 +79,16 @@
         "label": "Gemini 3 Flash - Fast",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       },
       {
         "id": "gemini-3.1-pro-preview-customtools",
         "label": "Gemini 3.1 Pro - Best quality",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       }
     ]
   },
@@ -80,28 +100,32 @@
         "label": "GPT-OSS 120B - Best reasoning",
         "recommended": true,
         "max_tokens": 65536,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "openai/gpt-oss-20b",
         "label": "GPT-OSS 20B - Fast + cheaper",
         "recommended": false,
         "max_tokens": 65536,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "llama-3.3-70b-versatile",
         "label": "Llama 3.3 70B - General purpose",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
      },
       {
         "id": "llama-3.1-8b-instant",
         "label": "Llama 3.1 8B - Fastest",
         "recommended": false,
         "max_tokens": 131072,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       }
     ]
   },
@@ -113,21 +137,24 @@
         "label": "GPT-OSS 120B - Best production reasoning",
         "recommended": true,
         "max_tokens": 40960,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "zai-glm-4.7",
         "label": "Z.ai GLM 4.7 - Strong coding preview",
         "recommended": true,
         "max_tokens": 40960,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "qwen-3-235b-a22b-instruct-2507",
         "label": "Qwen 3 235B Instruct - Frontier preview",
         "recommended": false,
         "max_tokens": 40960,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       }
     ]
   },
@@ -143,14 +170,16 @@
         "pricing_usd_per_mtok": {
           "input": 0.30,
           "output": 1.20
-        }
+        },
+        "supports_vision": false
       },
       {
         "id": "MiniMax-M2.5",
         "label": "MiniMax M2.5 - Strong value",
         "recommended": false,
         "max_tokens": 40960,
-        "max_context_tokens": 180000
+        "max_context_tokens": 180000,
+        "supports_vision": false
       }
     ]
   },
@@ -162,28 +191,32 @@
         "label": "Mistral Large 3 - Best quality",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 256000
+        "max_context_tokens": 256000,
+        "supports_vision": true
       },
       {
         "id": "mistral-medium-2508",
         "label": "Mistral Medium 3.1 - Balanced",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": true
       },
       {
         "id": "mistral-small-2603",
         "label": "Mistral Small 4 - Fast + capable",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 256000
+        "max_context_tokens": 256000,
+        "supports_vision": true
       },
       {
         "id": "codestral-2508",
         "label": "Codestral - Coding specialist",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       }
     ]
   },
@@ -195,47 +228,71 @@
         "label": "DeepSeek V3.1 - Best general coding",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       },
       {
         "id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8",
         "label": "Qwen3 Coder 480B - Advanced coding",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 262144
+        "max_context_tokens": 262144,
+        "supports_vision": false
       },
       {
         "id": "openai/gpt-oss-120b",
         "label": "GPT-OSS 120B - Strong reasoning",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       },
       {
         "id": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
         "label": "Llama 3.3 70B Turbo - Fast baseline",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       }
     ]
   },
   "deepseek": {
-    "default_model": "deepseek-chat",
+    "default_model": "deepseek-v4-pro",
     "models": [
       {
-        "id": "deepseek-chat",
-        "label": "DeepSeek Chat - Fast default",
+        "id": "deepseek-v4-pro",
+        "label": "DeepSeek V4 Pro - Most capable",
         "recommended": true,
-        "max_tokens": 8192,
-        "max_context_tokens": 128000
+        "max_tokens": 384000,
+        "max_context_tokens": 1000000,
+        "pricing_usd_per_mtok": {
+          "input": 1.74,
+          "output": 3.48,
+          "cache_read": 0.145
+        },
+        "supports_vision": false
+      },
+      {
+        "id": "deepseek-v4-flash",
+        "label": "DeepSeek V4 Flash - Fast + cheap",
+        "recommended": true,
+        "max_tokens": 384000,
+        "max_context_tokens": 1000000,
+        "pricing_usd_per_mtok": {
+          "input": 0.14,
+          "output": 0.28,
+          "cache_read": 0.028
+        },
+        "supports_vision": false
       },
       {
         "id": "deepseek-reasoner",
-        "label": "DeepSeek Reasoner - Deep thinking",
+        "label": "DeepSeek Reasoner - Legacy (deprecating)",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       }
     ]
   },
@@ -252,7 +309,8 @@
           "input": 0.60,
           "output": 2.50,
           "cache_read": 0.15
-        }
+        },
+        "supports_vision": true
       }
     ]
   },
@@ -264,14 +322,16 @@
         "label": "Queen - Hive native",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 180000
+        "max_context_tokens": 180000,
+        "supports_vision": false
       },
       {
         "id": "kimi-2.5",
         "label": "Kimi 2.5 - Via Hive",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       },
       {
         "id": "glm-5.1",
@@ -284,7 +344,8 @@
           "output": 4.40,
           "cache_read": 0.26,
           "cache_creation": 0.0
-        }
+        },
+        "supports_vision": false
       }
     ]
   },
@@ -296,42 +357,48 @@
         "label": "GPT-5.4 - Best overall",
         "recommended": true,
         "max_tokens": 128000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "anthropic/claude-sonnet-4.6",
         "label": "Claude Sonnet 4.6 - Best coding balance",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "anthropic/claude-opus-4.6",
         "label": "Claude Opus 4.6 - Most capable",
         "recommended": false,
         "max_tokens": 128000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "google/gemini-3.1-pro-preview-customtools",
         "label": "Gemini 3.1 Pro Preview - Long-context reasoning",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "qwen/qwen3.6-plus",
         "label": "Qwen 3.6 Plus - Strong reasoning",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": false
       },
       {
         "id": "z-ai/glm-5v-turbo",
         "label": "GLM-5V Turbo - Vision capable",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 192000
+        "max_context_tokens": 192000,
+        "supports_vision": true
       },
       {
         "id": "z-ai/glm-5.1",
@@ -344,7 +411,8 @@
           "output": 4.40,
           "cache_read": 0.26,
           "cache_creation": 0.0
-        }
+        },
+        "supports_vision": false
       },
       {
         "id": "minimax/minimax-m2.7",
@@ -355,14 +423,16 @@
         "pricing_usd_per_mtok": {
           "input": 0.30,
           "output": 1.20
-        }
+        },
+        "supports_vision": false
       },
       {
         "id": "xiaomi/mimo-v2-pro",
         "label": "MiMo V2 Pro - Xiaomi multimodal",
         "recommended": true,
         "max_tokens": 64000,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       }
     ]
   }
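Downstream consumers read these entries through the helpers in the model_catalog module (next file). A sketch of both lookups against the DeepSeek V4 Flash entry above; the function names appear in this diff, but the exact return shape of get_model_pricing is an assumption from its signature:

    from framework.llm.model_catalog import get_model_pricing, model_supports_vision

    # Text-only per its catalog entry, so tool-result images get rerouted.
    assert model_supports_vision("deepseek-v4-flash") is False

    # Pricing comes from the same entry; keys mirror pricing_usd_per_mtok.
    pricing = get_model_pricing("deepseek-v4-flash")
    assert pricing["input"] == 0.14 and pricing["cache_read"] == 0.028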
+32
@@ -95,6 +95,10 @@ def _validate_model_catalog(data: dict[str, Any]) -> dict[str, Any]:
             if pricing is not None:
                 _validate_pricing(pricing, f"{model_path}.pricing_usd_per_mtok")

+            supports_vision = model_map.get("supports_vision")
+            if supports_vision is not None and not isinstance(supports_vision, bool):
+                raise ModelCatalogError(f"{model_path}.supports_vision must be a boolean when present")
+
         if not default_found:
             raise ModelCatalogError(
                 f"{provider_path}.default_model={default_model!r} is not present in {provider_path}.models"
@@ -229,6 +233,34 @@ def get_model_pricing(model_id: str) -> dict[str, float] | None:
     return None


+def model_supports_vision(model_id: str) -> bool:
+    """Return whether *model_id* supports image inputs per the curated catalog.
+
+    Looks up the bare model id (and the provider-prefix-stripped form) in the
+    catalog. Returns the model's ``supports_vision`` flag when found, defaulting
+    to ``True`` for unknown models or when the flag is absent — assume vision-
+    capable for hosted providers, since modern frontier models support images
+    by default and the captioning fallback is more expensive than just letting
+    the provider handle the image.
+    """
+    if not model_id:
+        return True
+    candidates = [model_id]
+    if "/" in model_id:
+        candidates.append(model_id.split("/", 1)[1])
+    for candidate in candidates:
+        for provider_info in load_model_catalog()["providers"].values():
+            for model in provider_info["models"]:
+                if model["id"] == candidate:
+                    flag = model.get("supports_vision")
+                    if isinstance(flag, bool):
+                        return flag
+                    return True
+    return True
+
+
 def get_preset(preset_id: str) -> dict[str, Any] | None:
     """Return one preset entry."""
     preset = load_model_catalog()["presets"].get(preset_id)
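Because the lookup tries the raw id first and then the prefix-stripped form, provider-prefixed ids resolve to the same catalog entry as their bare names. The ids below are from this commit's catalog; the cerebras/ prefix is just an illustrative routing prefix, not a claim about how callers format ids:

    from framework.llm.model_catalog import model_supports_vision

    assert model_supports_vision("zai-glm-4.7") is False            # direct catalog hit
    assert model_supports_vision("cerebras/zai-glm-4.7") is False   # stripped-prefix hit
    assert model_supports_vision("unlisted-model") is True          # catalog default
    assert model_supports_vision("") is True                        # empty-string guard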
+16 -9
@@ -24,12 +24,12 @@ def test_default_models_exist_in_each_provider_catalogue():


 def test_find_model_returns_curated_token_limits():
-    model = model_catalog.find_model("openai", "gpt-5.4")
+    model = model_catalog.find_model("openai", "gpt-5.5")

     assert model is not None
-    assert model["label"] == "GPT-5.4 - Best intelligence"
+    assert model["label"] == "GPT-5.5 - Frontier coding + reasoning"
     assert model["max_tokens"] == 128000
-    assert model["max_context_tokens"] == 960000
+    assert model["max_context_tokens"] == 1050000


 def test_anthropic_curated_limits_track_documented_caps_with_safe_input_budget():
@@ -125,15 +125,22 @@ def test_deepseek_catalog_tracks_current_api_models():
     deepseek_default = model_catalog.get_default_models()["deepseek"]
     deepseek_models = model_catalog.get_models_catalogue()["deepseek"]

-    assert deepseek_default == "deepseek-chat"
+    assert deepseek_default == "deepseek-v4-pro"
     assert [model["id"] for model in deepseek_models] == [
-        "deepseek-chat",
+        "deepseek-v4-pro",
+        "deepseek-v4-flash",
         "deepseek-reasoner",
     ]
-    assert deepseek_models[0]["max_tokens"] == 8192
-    assert deepseek_models[0]["max_context_tokens"] == 128000
-    assert deepseek_models[1]["max_tokens"] == 64000
-    assert deepseek_models[1]["max_context_tokens"] == 128000
+    # V4 family — 1M context, 384k max output, mirrors api-docs.deepseek.com pricing.
+    assert deepseek_models[0]["max_tokens"] == 384000
+    assert deepseek_models[0]["max_context_tokens"] == 1000000
+    assert deepseek_models[0]["pricing_usd_per_mtok"]["input"] == 1.74
+    assert deepseek_models[0]["pricing_usd_per_mtok"]["output"] == 3.48
+    assert deepseek_models[1]["pricing_usd_per_mtok"]["input"] == 0.14
+    assert deepseek_models[1]["pricing_usd_per_mtok"]["output"] == 0.28
+    # Legacy reasoner kept for back-compat while users migrate.
+    assert deepseek_models[2]["max_tokens"] == 64000
+    assert deepseek_models[2]["max_context_tokens"] == 128000


 def test_openrouter_catalog_tracks_current_frontier_set():
+65 -17
@@ -1352,9 +1352,11 @@ fi
 echo ""
 echo -e "  ${CYAN}${BOLD}API key providers:${NC}"

-# 8-13) API key providers — show (credential detected) if key already set
-PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY)
-PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model")
+# 8-N) API key providers — show (credential detected) if key already set.
+# Order is reflected directly in the menu numbering; the case dispatcher
+# below resolves choice numbers via $((8 + index_in_arrays)).
+PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY DEEPSEEK_API_KEY)
+PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model" "DeepSeek - V4 family")
 for idx in "${!PROVIDER_MENU_ENVS[@]}"; do
     num=$((idx + 8))
     env_var="${PROVIDER_MENU_ENVS[$idx]}"
@@ -1365,14 +1367,16 @@ for idx in "${!PROVIDER_MENU_ENVS[@]}"; do
     fi
 done

-# 14) Local (Ollama) — no API key needed
+# Local (Ollama) — slot computed from the provider list so adding/removing
+# API-key providers above doesn't require renumbering by hand.
+OLLAMA_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]}))
 if [ "$OLLAMA_DETECTED" = true ]; then
-    echo -e "  ${CYAN}14)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}"
+    echo -e "  ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}"
 else
-    echo -e "  ${CYAN}14)${NC} Local (Ollama) - No API key needed"
+    echo -e "  ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed"
 fi

-SKIP_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]} + 1))
+SKIP_CHOICE=$((OLLAMA_CHOICE + 1))
 echo -e "  ${CYAN}$SKIP_CHOICE)${NC} Skip for now"
 echo ""
@@ -1578,6 +1582,13 @@ case $choice in
         SIGNUP_URL="https://openrouter.ai/keys"
         ;;
     14)
+        SELECTED_ENV_VAR="DEEPSEEK_API_KEY"
+        SELECTED_PROVIDER_ID="deepseek"
+        SELECTED_API_BASE="https://api.deepseek.com"
+        PROVIDER_NAME="DeepSeek"
+        SIGNUP_URL="https://platform.deepseek.com/api_keys"
+        ;;
+    "$OLLAMA_CHOICE")
         # Local (Ollama) — no API key; pick model from ollama list
         if [ "$OLLAMA_DETECTED" != true ]; then
             echo ""
@@ -1824,12 +1835,29 @@ echo ""
 # image through a separate VLM subagent that returns a text caption,
 # preserving the agent's ability to reason about visual state.
 #
-# We always offer the prompt — even for vision-capable main models —
-# so the user gets a working fallback if they ever swap to a text-only
-# model. The block is dormant for vision-capable mains (the gating
-# in agent_loop only fires for models on Hive's deny list).
+# Skip entirely when the chosen main model already supports vision per
+# the catalog's ``supports_vision`` flag — the fallback would never fire
+# in that case, and prompting for it just adds friction. For text-only
+# mains we still offer the prompt so the user can wire up a captioning
+# subagent.

-if [ -n "$SELECTED_PROVIDER_ID" ]; then
+MAIN_MODEL_HAS_VISION="false"
+if [ -n "$SELECTED_MODEL" ]; then
+    MAIN_MODEL_HAS_VISION=$(uv run python - "$SELECTED_MODEL" <<'PY' 2>/dev/null || echo "false"
+import sys
+from framework.llm.model_catalog import model_supports_vision
+print("true" if model_supports_vision(sys.argv[1]) else "false")
+PY
+)
+fi
+
+if [ -n "$SELECTED_PROVIDER_ID" ] && [ "$MAIN_MODEL_HAS_VISION" = "true" ]; then
+    # Drop any stale vision_fallback block so the config reflects the
+    # current main model's capabilities.
+    save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true
+    echo -e "${GREEN}✓${NC} Vision fallback ${DIM}skipped — ${SELECTED_MODEL} already supports vision${NC}"
+    echo ""
+elif [ -n "$SELECTED_PROVIDER_ID" ]; then
     echo -e "${YELLOW}▸${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}"
     echo ""
     echo -e "  ${DIM}When a screenshot/image tool is called from a text-only model,${NC}"
@@ -1840,9 +1868,13 @@ if [ -n "$SELECTED_PROVIDER_ID" ]; then

     # Build the candidate list from the same model_catalog.json the main
     # LLM step uses — never hardcode model IDs in this script. For each
-    # provider in the catalogue, take the catalogue's default model and
-    # the env var name it expects, then keep only providers the user
-    # already has an API key for. Output one TSV row per candidate:
+    # provider in the catalogue, pick a model whose ``supports_vision``
+    # flag is true (since the fallback subagent's whole purpose is to
+    # caption images — a text-only candidate would be useless). Prefer
+    # the provider's default when it supports vision, otherwise fall
+    # back to the first vision-capable model in the provider's list.
+    # Skip the provider entirely if no model in its catalog supports
+    # vision. Output one TSV row per candidate:
     #   provider_id<TAB>model<TAB>env_var<TAB>display_name
     VISION_CANDIDATES_TSV=$(uv run python - <<'PY'
 import os
@@ -1879,9 +1911,25 @@ for provider_id, default_model in sorted(defaults.items()):
         env = "GOOGLE_API_KEY"
     if not has_key:
         continue
+    # Pick a vision-capable model: prefer the catalog default if it has
+    # supports_vision=true, else the first vision-capable model in the
+    # provider's list. Skip the provider if none exist.
+    models = catalog.get(provider_id, [])
+    chosen = None
+    for m in models:
+        if m["id"] == default_model and m.get("supports_vision") is True:
+            chosen = m["id"]
+            break
+    if chosen is None:
+        for m in models:
+            if m.get("supports_vision") is True:
+                chosen = m["id"]
+                break
+    if chosen is None:
+        continue
     # Display name: provider/model from the catalogue verbatim
-    display = f"{provider_id}/{default_model}"
-    print(f"{provider_id}\t{default_model}\t{env}\t{display}")
+    display = f"{provider_id}/{chosen}"
+    print(f"{provider_id}\t{chosen}\t{env}\t{display}")
 PY
 )
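The heredoc's preference order (catalog default if vision-capable, else first vision-capable model, else skip the provider) is simple enough to restate in isolation. This sketch mirrors that logic on a toy catalogue, independent of the installer's real data:

    def pick_vision_model(models: list[dict], default_model: str) -> str | None:
        # Prefer the provider's default when it is explicitly vision-capable.
        for m in models:
            if m["id"] == default_model and m.get("supports_vision") is True:
                return m["id"]
        # Otherwise the first vision-capable model in catalog order.
        for m in models:
            if m.get("supports_vision") is True:
                return m["id"]
        return None  # provider has no usable captioning candidate

    toy = [
        {"id": "text-default", "supports_vision": False},
        {"id": "vlm-alt", "supports_vision": True},
    ]
    assert pick_vision_model(toy, "text-default") == "vlm-alt"
    assert pick_vision_model(toy[:1], "text-default") is None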