Files
hive/core/framework/llm/capabilities.py
T
2026-03-20 21:22:29 -07:00

107 lines
4.3 KiB
Python

"""Model capability checks for LLM providers.
Vision support rules are derived from official vendor documentation:
- ZAI (z.ai): docs.z.ai/guides/vlm — GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only
- MiniMax: platform.minimax.io/docs — minimax-vl-01 is vision; M2.x are text-only
- DeepSeek: api-docs.deepseek.com — deepseek-vl2 is vision; chat/reasoner are text-only
- Cerebras: inference-docs.cerebras.ai — no vision models at all
- Groq: console.groq.com/docs/vision — vision capable; treat as supported by default
- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names
don't reliably indicate vision support, so users must configure explicitly
"""
from __future__ import annotations
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
if "/" in model:
return model.split("/", 1)[1]
return model
# Step 1: explicit vision allow-list — these always support images regardless
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
# is allowed even though glm-4.6 is denied.
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
# ZAI/GLM vision models (docs.z.ai/guides/vlm)
"glm-4v", # GLM-4V series (legacy)
"glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
# DeepSeek vision models
"deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
# MiniMax vision model
"minimax-vl", # minimax-vl-01
)
# Step 2: provider-level deny — every model from this provider is text-only.
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
# Cerebras: inference-docs.cerebras.ai lists only text models
"cerebras/",
# Local runners: model names don't reliably indicate vision support
"ollama/",
"ollama_chat/",
"lm_studio/",
"vllm/",
"llamacpp/",
)
# Step 3: per-model deny — text-only models within otherwise mixed providers.
# Matched against the bare model name (provider prefix stripped, lower-cased).
# The vision allow-list above is checked first, so vision variants of the same
# family are already handled before these deny patterns are reached.
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
# --- ZAI / GLM family ---
# text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
# vision: glm-4v, glm-4.6v (caught by allow-list above)
"glm-5",
"glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
"glm-4.7",
"glm-4.5",
"zai-glm",
# --- DeepSeek ---
# text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
# vision: deepseek-vl2 (caught by allow-list above)
# Note: LiteLLM's deepseek handler may flatten content lists for some models;
# VL models are allowed through and rely on LiteLLM's native VL support.
"deepseek-chat",
"deepseek-coder",
"deepseek-reasoner",
# --- MiniMax ---
# text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
# vision: minimax-vl-01 (caught by allow-list above)
"minimax-m2",
"minimax-text",
"abab",
)
def supports_image_tool_results(model: str) -> bool:
"""Return whether *model* can receive image content in messages.
Used to gate both user-message images and tool-result image blocks.
Logic (checked in order):
1. Vision allow-list → True (known vision model, skip all denies)
2. Provider deny → False (entire provider is text-only)
3. Model deny → False (specific text-only model within a mixed provider)
4. Default → True (assume capable; unknown providers and models)
"""
model_lower = model.lower()
bare = _model_name(model_lower)
# 1. Explicit vision allow — takes priority over all denies
if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES):
return True
# 2. Provider-level deny (all models from this provider are text-only)
if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES):
return False
# 3. Per-model deny (text-only variants within mixed-capability families)
if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
return False
# 5. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True