feat: catalog-driven model vision support

Richard Tang
2026-04-24 20:17:41 -07:00
parent e7f9b7d791
commit 2ab5e6d784
7 changed files with 255 additions and 186 deletions
+8 -14
@@ -228,22 +228,16 @@ def _vision_fallback_active(model: str | None) -> bool:
     """Return True if tool-result images for *model* should be routed
     through the vision-fallback chain rather than sent to the model.

-    Trigger: the model appears in Hive's curated text-only deny list
-    (``capabilities.supports_image_tool_results`` returns False).
-    That list is the only reliable signal — LiteLLM's
-    ``supports_vision`` returns False for any unknown model
-    (including custom-served vision-capable models like Jackrong/Qwopus3.5)
-    so it cannot be used as a gate; and LiteLLM's openai chat
-    transformer doesn't strip image blocks anyway, so passing them
-    through to a vision-capable but litellm-unrecognised model still
-    works end-to-end.
+    Trigger: the model's catalog entry has ``supports_vision: false``
+    (resolved via :func:`capabilities.supports_image_tool_results`,
+    which reads ``model_catalog.json``). Unknown models default to
+    vision-capable, so the fallback only fires when the catalog
+    explicitly says the model is text-only.

     The ``vision_fallback`` config block is the *substitution* model —
-    it doesn't widen the trigger. To force fallback for a model the
-    deny list doesn't cover yet, add it to
-    ``capabilities._TEXT_ONLY_MODEL_BARE_PREFIXES`` /
-    ``_TEXT_ONLY_PROVIDER_PREFIXES`` rather than relying on a runtime
-    config.
+    it doesn't widen the trigger. To force fallback for a model that
+    isn't catalogued yet, add an entry to ``model_catalog.json`` with
+    ``supports_vision: false`` rather than relying on a runtime config.
     """
     if not model:
         return False
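Taken with the body shown above, the predicate reduces to a None guard plus a negation of the capability check. A minimal sketch of that behaviour, assuming the module is importable as framework.llm.capabilities (the diff names it only as capabilities; the sibling import elsewhere in this commit suggests the framework.llm package) and using ids this commit's catalog actually carries:

    from framework.llm.capabilities import supports_image_tool_results

    def vision_fallback_active(model: str | None) -> bool:
        # No model selected yet: nothing to reroute.
        if not model:
            return False
        # Fire only when the catalog explicitly marks the model text-only.
        return not supports_image_tool_results(model)

    assert vision_fallback_active(None) is False            # no model yet
    assert vision_fallback_active("zai-glm-4.7") is True    # catalogued text-only
    assert vision_fallback_active("gpt-5.5") is False       # catalogued vision-capable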
@@ -1,10 +1,10 @@
 """Vision-fallback subagent for tool-result images on text-only LLMs.

 When a tool returns image content but the main agent's model can't
-accept image blocks (per ``supports_image_tool_results``), the framework
-strips the images before they ever reach the LLM. Without this module,
-the agent then sees only the tool's text envelope (URL, dimensions,
-size) and is blind to whatever the image actually shows.
+accept image blocks (i.e. its catalog entry has ``supports_vision: false``),
+the framework strips the images before they ever reach the LLM. Without
+this module, the agent then sees only the tool's text envelope (URL,
+dimensions, size) and is blind to whatever the image actually shows.

 This module provides:
+12 -94
@@ -1,114 +1,32 @@
"""Model capability checks for LLM providers. """Model capability checks for LLM providers.
Vision support rules are derived from official vendor documentation: Vision support is sourced from the curated ``model_catalog.json``. Each model
- ZAI (z.ai): docs.z.ai/guides/vlm GLM-4.6V variants are vision; GLM-5/4.6/4.7 are text-only entry carries an optional ``supports_vision`` boolean; unknown models default
- MiniMax: platform.minimax.io/docs minimax-vl-01 is vision; M2.x are text-only to vision-capable so hosted frontier models work out of the box. To toggle
- DeepSeek: api-docs.deepseek.com deepseek-vl2 is vision; chat/reasoner are text-only support for a model, edit its catalog entry rather than this file.
- Cerebras: inference-docs.cerebras.ai no vision models at all
- Groq: console.groq.com/docs/vision vision capable; treat as supported by default
- Ollama/LM Studio/vLLM/llama.cpp: local runners denied by default; model names
don't reliably indicate vision support, so users must configure explicitly
""" """
from __future__ import annotations from __future__ import annotations
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from framework.llm.model_catalog import model_supports_vision
if TYPE_CHECKING: if TYPE_CHECKING:
from framework.llm.provider import Tool from framework.llm.provider import Tool
def _model_name(model: str) -> str:
"""Return the bare model name after stripping any 'provider/' prefix."""
if "/" in model:
return model.split("/", 1)[1]
return model
# Step 1: explicit vision allow-list — these always support images regardless
# of what the provider-level rules say. Checked first so that e.g. glm-4.6v
# is allowed even though glm-4.6 is denied.
_VISION_ALLOW_BARE_PREFIXES: tuple[str, ...] = (
# ZAI/GLM vision models (docs.z.ai/guides/vlm)
"glm-4v", # GLM-4V series (legacy)
"glm-4.6v", # GLM-4.6V, GLM-4.6V-flash, GLM-4.6V-flashx
# DeepSeek vision models
"deepseek-vl", # deepseek-vl2, deepseek-vl2-small, deepseek-vl2-tiny
# MiniMax vision model
"minimax-vl", # minimax-vl-01
)
# Step 2: provider-level deny — every model from this provider is text-only.
_TEXT_ONLY_PROVIDER_PREFIXES: tuple[str, ...] = (
# Cerebras: inference-docs.cerebras.ai lists only text models
"cerebras/",
# Local runners: model names don't reliably indicate vision support
"ollama/",
"ollama_chat/",
"lm_studio/",
"vllm/",
"llamacpp/",
)
# Step 3: per-model deny — text-only models within otherwise mixed providers.
# Matched against the bare model name (provider prefix stripped, lower-cased).
# The vision allow-list above is checked first, so vision variants of the same
# family are already handled before these deny patterns are reached.
_TEXT_ONLY_MODEL_BARE_PREFIXES: tuple[str, ...] = (
# --- ZAI / GLM family ---
# text-only: glm-5, glm-4.6, glm-4.7, glm-4.5, zai-glm-*
# vision: glm-4v, glm-4.6v (caught by allow-list above)
"glm-5",
"glm-4.6", # bare glm-4.6 is text-only; glm-4.6v is caught by allow-list
"glm-4.7",
"glm-4.5",
"zai-glm",
# --- DeepSeek ---
# text-only: deepseek-chat, deepseek-coder, deepseek-reasoner
# vision: deepseek-vl2 (caught by allow-list above)
# Note: LiteLLM's deepseek handler may flatten content lists for some models;
# VL models are allowed through and rely on LiteLLM's native VL support.
"deepseek-chat",
"deepseek-coder",
"deepseek-reasoner",
# --- MiniMax ---
# text-only: minimax-m2.*, minimax-text-*, abab* (legacy)
# vision: minimax-vl-01 (caught by allow-list above)
"minimax-m2",
"minimax-text",
"abab",
)
def supports_image_tool_results(model: str) -> bool: def supports_image_tool_results(model: str) -> bool:
"""Return whether *model* can receive image content in messages. """Return whether *model* can receive image content in messages.
Used to gate both user-message images and tool-result image blocks. Thin wrapper over :func:`model_supports_vision` so existing call sites
keep working. Used to gate both user-message images and tool-result
Logic (checked in order): image blocks. Empty model strings are treated as capable so the default
1. Vision allow-list True (known vision model, skip all denies) code path doesn't strip images before a provider is selected.
2. Provider deny False (entire provider is text-only)
3. Model deny False (specific text-only model within a mixed provider)
4. Default True (assume capable; unknown providers and models)
""" """
model_lower = model.lower() if not model:
bare = _model_name(model_lower)
# 1. Explicit vision allow — takes priority over all denies
if any(bare.startswith(p) for p in _VISION_ALLOW_BARE_PREFIXES):
return True return True
return model_supports_vision(model)
# 2. Provider-level deny (all models from this provider are text-only)
if any(model_lower.startswith(p) for p in _TEXT_ONLY_PROVIDER_PREFIXES):
return False
# 3. Per-model deny (text-only variants within mixed-capability families)
if any(bare.startswith(p) for p in _TEXT_ONLY_MODEL_BARE_PREFIXES):
return False
# 5. Default: assume vision capable
# Covers: OpenAI, Anthropic, Google, Mistral, Kimi, and other hosted providers
return True
def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]: def filter_tools_for_model(tools: list[Tool], model: str) -> tuple[list[Tool], list[str]]:
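The wrapper's contract per its docstring: empty strings short-circuit to True, everything else defers to the catalog. A quick sanity check under the same assumed import path (the text-only id comes from the catalog below; the unknown id is made up for illustration):

    from framework.llm.capabilities import supports_image_tool_results

    assert supports_image_tool_results("") is True                  # no provider selected yet
    assert supports_image_tool_results("deepseek-v4-pro") is False  # catalogued text-only
    assert supports_image_tool_results("acme/brand-new-model") is True  # unknown, default capable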
+118 -48
@@ -9,47 +9,65 @@
         "label": "Haiku 4.5 - Fast + cheap",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 136000
+        "max_context_tokens": 136000,
+        "supports_vision": true
       },
       {
         "id": "claude-sonnet-4-5-20250929",
         "label": "Sonnet 4.5 - Best balance",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 136000
+        "max_context_tokens": 136000,
+        "supports_vision": true
       },
       {
         "id": "claude-opus-4-6",
         "label": "Opus 4.6 - Most capable",
         "recommended": true,
         "max_tokens": 128000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       }
     ]
   },
   "openai": {
-    "default_model": "gpt-5.4",
+    "default_model": "gpt-5.5",
     "models": [
       {
-        "id": "gpt-5.4",
-        "label": "GPT-5.4 - Best intelligence",
+        "id": "gpt-5.5",
+        "label": "GPT-5.5 - Frontier coding + reasoning",
         "recommended": true,
         "max_tokens": 128000,
-        "max_context_tokens": 960000
+        "max_context_tokens": 1050000,
+        "pricing_usd_per_mtok": {
+          "input": 5.00,
+          "output": 30.00
+        },
+        "supports_vision": true
+      },
+      {
+        "id": "gpt-5.4",
+        "label": "GPT-5.4 - Previous flagship",
+        "recommended": false,
+        "max_tokens": 128000,
+        "max_context_tokens": 960000,
+        "supports_vision": true
       },
       {
         "id": "gpt-5.4-mini",
         "label": "GPT-5.4 Mini - Faster + cheaper",
         "recommended": false,
         "max_tokens": 128000,
-        "max_context_tokens": 400000
+        "max_context_tokens": 400000,
+        "supports_vision": true
       },
       {
         "id": "gpt-5.4-nano",
         "label": "GPT-5.4 Nano - Cheapest high-volume",
         "recommended": false,
         "max_tokens": 128000,
-        "max_context_tokens": 400000
+        "max_context_tokens": 400000,
+        "supports_vision": true
       }
     ]
   },
@@ -61,14 +79,16 @@
         "label": "Gemini 3 Flash - Fast",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       },
       {
         "id": "gemini-3.1-pro-preview-customtools",
         "label": "Gemini 3.1 Pro - Best quality",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       }
     ]
   },
@@ -80,28 +100,32 @@
         "label": "GPT-OSS 120B - Best reasoning",
         "recommended": true,
         "max_tokens": 65536,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "openai/gpt-oss-20b",
         "label": "GPT-OSS 20B - Fast + cheaper",
         "recommended": false,
         "max_tokens": 65536,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "llama-3.3-70b-versatile",
         "label": "Llama 3.3 70B - General purpose",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
      },
       {
         "id": "llama-3.1-8b-instant",
         "label": "Llama 3.1 8B - Fastest",
         "recommended": false,
         "max_tokens": 131072,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       }
     ]
   },
@@ -113,21 +137,24 @@
         "label": "GPT-OSS 120B - Best production reasoning",
         "recommended": true,
         "max_tokens": 40960,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "zai-glm-4.7",
         "label": "Z.ai GLM 4.7 - Strong coding preview",
         "recommended": true,
         "max_tokens": 40960,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       },
       {
         "id": "qwen-3-235b-a22b-instruct-2507",
         "label": "Qwen 3 235B Instruct - Frontier preview",
         "recommended": false,
         "max_tokens": 40960,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       }
     ]
   },
@@ -143,14 +170,16 @@
         "pricing_usd_per_mtok": {
           "input": 0.30,
           "output": 1.20
-        }
+        },
+        "supports_vision": false
       },
       {
         "id": "MiniMax-M2.5",
         "label": "MiniMax M2.5 - Strong value",
         "recommended": false,
         "max_tokens": 40960,
-        "max_context_tokens": 180000
+        "max_context_tokens": 180000,
+        "supports_vision": false
       }
     ]
   },
@@ -162,28 +191,32 @@
         "label": "Mistral Large 3 - Best quality",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 256000
+        "max_context_tokens": 256000,
+        "supports_vision": true
       },
       {
         "id": "mistral-medium-2508",
         "label": "Mistral Medium 3.1 - Balanced",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": true
       },
       {
         "id": "mistral-small-2603",
         "label": "Mistral Small 4 - Fast + capable",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 256000
+        "max_context_tokens": 256000,
+        "supports_vision": true
       },
       {
         "id": "codestral-2508",
         "label": "Codestral - Coding specialist",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       }
     ]
   },
@@ -195,47 +228,71 @@
         "label": "DeepSeek V3.1 - Best general coding",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       },
       {
         "id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8",
         "label": "Qwen3 Coder 480B - Advanced coding",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 262144
+        "max_context_tokens": 262144,
+        "supports_vision": false
       },
       {
         "id": "openai/gpt-oss-120b",
         "label": "GPT-OSS 120B - Strong reasoning",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       },
       {
         "id": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
         "label": "Llama 3.3 70B Turbo - Fast baseline",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 131072
+        "max_context_tokens": 131072,
+        "supports_vision": false
       }
     ]
   },
   "deepseek": {
-    "default_model": "deepseek-chat",
+    "default_model": "deepseek-v4-pro",
     "models": [
       {
-        "id": "deepseek-chat",
-        "label": "DeepSeek Chat - Fast default",
+        "id": "deepseek-v4-pro",
+        "label": "DeepSeek V4 Pro - Most capable",
         "recommended": true,
-        "max_tokens": 8192,
-        "max_context_tokens": 128000
+        "max_tokens": 384000,
+        "max_context_tokens": 1000000,
+        "pricing_usd_per_mtok": {
+          "input": 1.74,
+          "output": 3.48,
+          "cache_read": 0.145
+        },
+        "supports_vision": false
+      },
+      {
+        "id": "deepseek-v4-flash",
+        "label": "DeepSeek V4 Flash - Fast + cheap",
+        "recommended": true,
+        "max_tokens": 384000,
+        "max_context_tokens": 1000000,
+        "pricing_usd_per_mtok": {
+          "input": 0.14,
+          "output": 0.28,
+          "cache_read": 0.028
+        },
+        "supports_vision": false
       },
       {
         "id": "deepseek-reasoner",
-        "label": "DeepSeek Reasoner - Deep thinking",
+        "label": "DeepSeek Reasoner - Legacy (deprecating)",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 128000
+        "max_context_tokens": 128000,
+        "supports_vision": false
       }
     ]
   },
@@ -252,7 +309,8 @@
           "input": 0.60,
           "output": 2.50,
           "cache_read": 0.15
-        }
+        },
+        "supports_vision": true
       }
     ]
   },
@@ -264,14 +322,16 @@
         "label": "Queen - Hive native",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 180000
+        "max_context_tokens": 180000,
+        "supports_vision": false
       },
       {
         "id": "kimi-2.5",
         "label": "Kimi 2.5 - Via Hive",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       },
       {
         "id": "glm-5.1",
@@ -284,7 +344,8 @@
           "output": 4.40,
           "cache_read": 0.26,
           "cache_creation": 0.0
-        }
+        },
+        "supports_vision": false
       }
     ]
   },
@@ -296,42 +357,48 @@
         "label": "GPT-5.4 - Best overall",
         "recommended": true,
         "max_tokens": 128000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "anthropic/claude-sonnet-4.6",
         "label": "Claude Sonnet 4.6 - Best coding balance",
         "recommended": false,
         "max_tokens": 64000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "anthropic/claude-opus-4.6",
         "label": "Claude Opus 4.6 - Most capable",
         "recommended": false,
         "max_tokens": 128000,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "google/gemini-3.1-pro-preview-customtools",
         "label": "Gemini 3.1 Pro Preview - Long-context reasoning",
         "recommended": false,
         "max_tokens": 32768,
-        "max_context_tokens": 872000
+        "max_context_tokens": 872000,
+        "supports_vision": true
       },
       {
         "id": "qwen/qwen3.6-plus",
         "label": "Qwen 3.6 Plus - Strong reasoning",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": false
       },
       {
         "id": "z-ai/glm-5v-turbo",
         "label": "GLM-5V Turbo - Vision capable",
         "recommended": true,
         "max_tokens": 32768,
-        "max_context_tokens": 192000
+        "max_context_tokens": 192000,
+        "supports_vision": true
       },
       {
         "id": "z-ai/glm-5.1",
@@ -344,7 +411,8 @@
           "output": 4.40,
           "cache_read": 0.26,
           "cache_creation": 0.0
-        }
+        },
+        "supports_vision": false
       },
       {
         "id": "minimax/minimax-m2.7",
@@ -355,14 +423,16 @@
         "pricing_usd_per_mtok": {
           "input": 0.30,
           "output": 1.20
-        }
+        },
+        "supports_vision": false
       },
       {
         "id": "xiaomi/mimo-v2-pro",
         "label": "MiMo V2 Pro - Xiaomi multimodal",
         "recommended": true,
         "max_tokens": 64000,
-        "max_context_tokens": 240000
+        "max_context_tokens": 240000,
+        "supports_vision": true
       }
     ]
   }
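Downstream consumers read these entries through the helpers in the model_catalog module (next file). A sketch of both lookups against the DeepSeek V4 Flash entry above; the function names appear in this diff, but the exact return shape of get_model_pricing is an assumption from its signature:

    from framework.llm.model_catalog import get_model_pricing, model_supports_vision

    # Text-only per its catalog entry, so tool-result images get rerouted.
    assert model_supports_vision("deepseek-v4-flash") is False

    # Pricing comes from the same entry; keys mirror pricing_usd_per_mtok.
    pricing = get_model_pricing("deepseek-v4-flash")
    assert pricing["input"] == 0.14 and pricing["cache_read"] == 0.028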
+32
@@ -95,6 +95,10 @@ def _validate_model_catalog(data: dict[str, Any]) -> dict[str, Any]:
             if pricing is not None:
                 _validate_pricing(pricing, f"{model_path}.pricing_usd_per_mtok")

+            supports_vision = model_map.get("supports_vision")
+            if supports_vision is not None and not isinstance(supports_vision, bool):
+                raise ModelCatalogError(f"{model_path}.supports_vision must be a boolean when present")
+
         if not default_found:
             raise ModelCatalogError(
                 f"{provider_path}.default_model={default_model!r} is not present in {provider_path}.models"
@@ -229,6 +233,34 @@ def get_model_pricing(model_id: str) -> dict[str, float] | None:
     return None


+def model_supports_vision(model_id: str) -> bool:
+    """Return whether *model_id* supports image inputs per the curated catalog.
+
+    Looks up the bare model id (and the provider-prefix-stripped form) in the
+    catalog. Returns the model's ``supports_vision`` flag when found, defaulting
+    to ``True`` for unknown models or when the flag is absent — assume vision-
+    capable for hosted providers, since modern frontier models support images
+    by default and the captioning fallback is more expensive than just letting
+    the provider handle the image.
+    """
+    if not model_id:
+        return True
+    candidates = [model_id]
+    if "/" in model_id:
+        candidates.append(model_id.split("/", 1)[1])
+    for candidate in candidates:
+        for provider_info in load_model_catalog()["providers"].values():
+            for model in provider_info["models"]:
+                if model["id"] == candidate:
+                    flag = model.get("supports_vision")
+                    if isinstance(flag, bool):
+                        return flag
+                    return True
+    return True
+
+
 def get_preset(preset_id: str) -> dict[str, Any] | None:
     """Return one preset entry."""
     preset = load_model_catalog()["presets"].get(preset_id)
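Because the lookup tries the raw id first and then the prefix-stripped form, provider-prefixed ids resolve to the same catalog entry as their bare names. The ids below are from this commit's catalog; the cerebras/ prefix is just an illustrative routing prefix, not a claim about how callers format ids:

    from framework.llm.model_catalog import model_supports_vision

    assert model_supports_vision("zai-glm-4.7") is False            # direct catalog hit
    assert model_supports_vision("cerebras/zai-glm-4.7") is False   # stripped-prefix hit
    assert model_supports_vision("unlisted-model") is True          # catalog default
    assert model_supports_vision("") is True                        # empty-string guard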
+16 -9
@@ -24,12 +24,12 @@ def test_default_models_exist_in_each_provider_catalogue():


 def test_find_model_returns_curated_token_limits():
-    model = model_catalog.find_model("openai", "gpt-5.4")
+    model = model_catalog.find_model("openai", "gpt-5.5")

     assert model is not None
-    assert model["label"] == "GPT-5.4 - Best intelligence"
+    assert model["label"] == "GPT-5.5 - Frontier coding + reasoning"
     assert model["max_tokens"] == 128000
-    assert model["max_context_tokens"] == 960000
+    assert model["max_context_tokens"] == 1050000


 def test_anthropic_curated_limits_track_documented_caps_with_safe_input_budget():
@@ -125,15 +125,22 @@ def test_deepseek_catalog_tracks_current_api_models():
     deepseek_default = model_catalog.get_default_models()["deepseek"]
     deepseek_models = model_catalog.get_models_catalogue()["deepseek"]

-    assert deepseek_default == "deepseek-chat"
+    assert deepseek_default == "deepseek-v4-pro"
     assert [model["id"] for model in deepseek_models] == [
-        "deepseek-chat",
+        "deepseek-v4-pro",
+        "deepseek-v4-flash",
         "deepseek-reasoner",
     ]
-    assert deepseek_models[0]["max_tokens"] == 8192
-    assert deepseek_models[0]["max_context_tokens"] == 128000
-    assert deepseek_models[1]["max_tokens"] == 64000
-    assert deepseek_models[1]["max_context_tokens"] == 128000
+    # V4 family — 1M context, 384k max output, mirrors api-docs.deepseek.com pricing.
+    assert deepseek_models[0]["max_tokens"] == 384000
+    assert deepseek_models[0]["max_context_tokens"] == 1000000
+    assert deepseek_models[0]["pricing_usd_per_mtok"]["input"] == 1.74
+    assert deepseek_models[0]["pricing_usd_per_mtok"]["output"] == 3.48
+    assert deepseek_models[1]["pricing_usd_per_mtok"]["input"] == 0.14
+    assert deepseek_models[1]["pricing_usd_per_mtok"]["output"] == 0.28
+    # Legacy reasoner kept for back-compat while users migrate.
+    assert deepseek_models[2]["max_tokens"] == 64000
+    assert deepseek_models[2]["max_context_tokens"] == 128000


 def test_openrouter_catalog_tracks_current_frontier_set():
+65 -17
@@ -1352,9 +1352,11 @@ fi
 echo ""
 echo -e "  ${CYAN}${BOLD}API key providers:${NC}"

-# 8-13) API key providers — show (credential detected) if key already set
-PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY)
-PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model")
+# 8-N) API key providers — show (credential detected) if key already set.
+# Order is reflected directly in the menu numbering; the case dispatcher
+# below resolves choice numbers via $((8 + index_in_arrays)).
+PROVIDER_MENU_ENVS=(ANTHROPIC_API_KEY OPENAI_API_KEY GEMINI_API_KEY GROQ_API_KEY CEREBRAS_API_KEY OPENROUTER_API_KEY DEEPSEEK_API_KEY)
+PROVIDER_MENU_NAMES=("Anthropic (Claude) - Recommended" "OpenAI (GPT)" "Google Gemini - Free tier available" "Groq - Fast, free tier" "Cerebras - Fast, free tier" "OpenRouter - Bring any OpenRouter model" "DeepSeek - V4 family")
 for idx in "${!PROVIDER_MENU_ENVS[@]}"; do
     num=$((idx + 8))
     env_var="${PROVIDER_MENU_ENVS[$idx]}"
@@ -1365,14 +1367,16 @@ for idx in "${!PROVIDER_MENU_ENVS[@]}"; do
     fi
 done

-# 14) Local (Ollama) — no API key needed
+# Local (Ollama) — slot computed from the provider list so adding/removing
+# API-key providers above doesn't require renumbering by hand.
+OLLAMA_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]}))
 if [ "$OLLAMA_DETECTED" = true ]; then
-    echo -e "  ${CYAN}14)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}"
+    echo -e "  ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed ${GREEN}(ollama detected)${NC}"
 else
-    echo -e "  ${CYAN}14)${NC} Local (Ollama) - No API key needed"
+    echo -e "  ${CYAN}$OLLAMA_CHOICE)${NC} Local (Ollama) - No API key needed"
 fi

-SKIP_CHOICE=$((8 + ${#PROVIDER_MENU_ENVS[@]} + 1))
+SKIP_CHOICE=$((OLLAMA_CHOICE + 1))
 echo -e "  ${CYAN}$SKIP_CHOICE)${NC} Skip for now"
 echo ""
@@ -1578,6 +1582,13 @@ case $choice in
         SIGNUP_URL="https://openrouter.ai/keys"
         ;;
     14)
+        SELECTED_ENV_VAR="DEEPSEEK_API_KEY"
+        SELECTED_PROVIDER_ID="deepseek"
+        SELECTED_API_BASE="https://api.deepseek.com"
+        PROVIDER_NAME="DeepSeek"
+        SIGNUP_URL="https://platform.deepseek.com/api_keys"
+        ;;
+    "$OLLAMA_CHOICE")
         # Local (Ollama) — no API key; pick model from ollama list
         if [ "$OLLAMA_DETECTED" != true ]; then
             echo ""
@@ -1824,12 +1835,29 @@ echo ""
 # image through a separate VLM subagent that returns a text caption,
 # preserving the agent's ability to reason about visual state.
 #
-# We always offer the prompt — even for vision-capable main models —
-# so the user gets a working fallback if they ever swap to a text-only
-# model. The block is dormant for vision-capable mains (the gating
-# in agent_loop only fires for models on Hive's deny list).
+# Skip entirely when the chosen main model already supports vision per
+# the catalog's ``supports_vision`` flag — the fallback would never fire
+# in that case, and prompting for it just adds friction. For text-only
+# mains we still offer the prompt so the user can wire up a captioning
+# subagent.

-if [ -n "$SELECTED_PROVIDER_ID" ]; then
+MAIN_MODEL_HAS_VISION="false"
+if [ -n "$SELECTED_MODEL" ]; then
+    MAIN_MODEL_HAS_VISION=$(uv run python - "$SELECTED_MODEL" <<'PY' 2>/dev/null || echo "false"
+import sys
+from framework.llm.model_catalog import model_supports_vision
+print("true" if model_supports_vision(sys.argv[1]) else "false")
+PY
+)
+fi
+
+if [ -n "$SELECTED_PROVIDER_ID" ] && [ "$MAIN_MODEL_HAS_VISION" = "true" ]; then
+    # Drop any stale vision_fallback block so the config reflects the
+    # current main model's capabilities.
+    save_vision_fallback "" "" "" "" > /dev/null 2>&1 || true
+    echo -e "${GREEN}✓${NC} Vision fallback ${DIM}skipped — ${SELECTED_MODEL} already supports vision${NC}"
+    echo ""
+elif [ -n "$SELECTED_PROVIDER_ID" ]; then
     echo -e "${YELLOW}▸${NC} ${BLUE}${BOLD}Vision fallback subagent${NC}"
     echo ""
     echo -e "  ${DIM}When a screenshot/image tool is called from a text-only model,${NC}"
@@ -1840,9 +1868,13 @@ if [ -n "$SELECTED_PROVIDER_ID" ]; then

     # Build the candidate list from the same model_catalog.json the main
     # LLM step uses — never hardcode model IDs in this script. For each
-    # provider in the catalogue, take the catalogue's default model and
-    # the env var name it expects, then keep only providers the user
-    # already has an API key for. Output one TSV row per candidate:
+    # provider in the catalogue, pick a model whose ``supports_vision``
+    # flag is true (since the fallback subagent's whole purpose is to
+    # caption images — a text-only candidate would be useless). Prefer
+    # the provider's default when it supports vision, otherwise fall
+    # back to the first vision-capable model in the provider's list.
+    # Skip the provider entirely if no model in its catalog supports
+    # vision. Output one TSV row per candidate:
     #   provider_id<TAB>model<TAB>env_var<TAB>display_name
     VISION_CANDIDATES_TSV=$(uv run python - <<'PY'
 import os
@@ -1879,9 +1911,25 @@ for provider_id, default_model in sorted(defaults.items()):
         env = "GOOGLE_API_KEY"
     if not has_key:
         continue
+    # Pick a vision-capable model: prefer the catalog default if it has
+    # supports_vision=true, else the first vision-capable model in the
+    # provider's list. Skip the provider if none exist.
+    models = catalog.get(provider_id, [])
+    chosen = None
+    for m in models:
+        if m["id"] == default_model and m.get("supports_vision") is True:
+            chosen = m["id"]
+            break
+    if chosen is None:
+        for m in models:
+            if m.get("supports_vision") is True:
+                chosen = m["id"]
+                break
+    if chosen is None:
+        continue
     # Display name: provider/model from the catalogue verbatim
-    display = f"{provider_id}/{default_model}"
-    print(f"{provider_id}\t{default_model}\t{env}\t{display}")
+    display = f"{provider_id}/{chosen}"
+    print(f"{provider_id}\t{chosen}\t{env}\t{display}")
 PY
 )
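The heredoc's preference order (catalog default if vision-capable, else first vision-capable model, else skip the provider) is simple enough to restate in isolation. This sketch mirrors that logic on a toy catalogue, independent of the installer's real data:

    def pick_vision_model(models: list[dict], default_model: str) -> str | None:
        # Prefer the provider's default when it is explicitly vision-capable.
        for m in models:
            if m["id"] == default_model and m.get("supports_vision") is True:
                return m["id"]
        # Otherwise the first vision-capable model in catalog order.
        for m in models:
            if m.get("supports_vision") is True:
                return m["id"]
        return None  # provider has no usable captioning candidate

    toy = [
        {"id": "text-default", "supports_vision": False},
        {"id": "vlm-alt", "supports_vision": True},
    ]
    assert pick_vision_model(toy, "text-default") == "vlm-alt"
    assert pick_vision_model(toy[:1], "text-default") is None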