From 77cc1696068cc7107f9d5ad05901380a4feae23e Mon Sep 17 00:00:00 2001
From: Richard Tang
Date: Thu, 23 Apr 2026 15:34:07 -0700
Subject: [PATCH] feat: cost tracking

---
 core/framework/agent_loop/agent_loop.py | 8 +-
 .../agent_loop/internals/event_publishing.py | 2 +
 core/framework/host/event_bus.py | 5 +
 core/framework/llm/litellm.py | 100 ++++++++++++++
 core/framework/llm/provider.py | 6 +
 core/framework/llm/stream_events.py | 4 +
 core/frontend/src/components/ChatPanel.tsx | 36 +++--
 core/frontend/src/pages/queen-dm.tsx | 7 +-
 core/tests/test_event_bus.py | 2 +
 core/tests/test_litellm_provider.py | 123 ++++++++++++++++++
 core/tests/test_stream_events.py | 1 +
 11 files changed, 283 insertions(+), 11 deletions(-)

diff --git a/core/framework/agent_loop/agent_loop.py b/core/framework/agent_loop/agent_loop.py
index 7ad47619..9e7ab2c3 100644
--- a/core/framework/agent_loop/agent_loop.py
+++ b/core/framework/agent_loop/agent_loop.py
@@ -940,6 +940,7 @@ class AgentLoop(AgentProtocol):
             output_tokens=turn_tokens.get("output", 0),
             cached_tokens=turn_tokens.get("cached", 0),
             cache_creation_tokens=turn_tokens.get("cache_creation", 0),
+            cost_usd=float(turn_tokens.get("cost", 0.0) or 0.0),
             execution_id=execution_id,
             iteration=iteration,
         )
@@ -2340,7 +2341,9 @@ class AgentLoop(AgentProtocol):
         stream_id = ctx.stream_id or ctx.agent_id
         node_id = ctx.agent_id
         execution_id = ctx.execution_id or ""
-        token_counts: dict[str, int] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0}
+        # Mixed-type dict: int token counts + str stop_reason/model + float cost.
+        # Typed loosely to avoid churn in the many call sites that read from it.
+        token_counts: dict[str, Any] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0, "cost": 0.0}
         tool_call_count = 0
         final_text = ""
         final_system_prompt = conversation.system_prompt
@@ -2572,6 +2575,7 @@
                 token_counts["output"] += event.output_tokens
                 token_counts["cached"] += event.cached_tokens
                 token_counts["cache_creation"] += event.cache_creation_tokens
+                token_counts["cost"] = token_counts.get("cost", 0.0) + event.cost_usd
                 token_counts["stop_reason"] = event.stop_reason
                 token_counts["model"] = event.model
@@ -4154,6 +4158,7 @@ class AgentLoop(AgentProtocol):
         output_tokens: int,
         cached_tokens: int = 0,
         cache_creation_tokens: int = 0,
+        cost_usd: float = 0.0,
         execution_id: str = "",
         iteration: int | None = None,
     ) -> None:
@@ -4167,6 +4172,7 @@ class AgentLoop(AgentProtocol):
             output_tokens=output_tokens,
             cached_tokens=cached_tokens,
             cache_creation_tokens=cache_creation_tokens,
+            cost_usd=cost_usd,
             execution_id=execution_id,
             iteration=iteration,
         )

diff --git a/core/framework/agent_loop/internals/event_publishing.py b/core/framework/agent_loop/internals/event_publishing.py
index cac4f9e4..87ba2a69 100644
--- a/core/framework/agent_loop/internals/event_publishing.py
+++ b/core/framework/agent_loop/internals/event_publishing.py
@@ -109,6 +109,7 @@ async def publish_llm_turn_complete(
     output_tokens: int,
     cached_tokens: int = 0,
     cache_creation_tokens: int = 0,
+    cost_usd: float = 0.0,
     execution_id: str = "",
     iteration: int | None = None,
 ) -> None:
@@ -122,6 +123,7 @@ async def publish_llm_turn_complete(
         output_tokens=output_tokens,
         cached_tokens=cached_tokens,
         cache_creation_tokens=cache_creation_tokens,
+        cost_usd=cost_usd,
         execution_id=execution_id,
         iteration=iteration,
     )
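The publishers above only thread `cost_usd` through to the bus, where it becomes one more key on the event's data dict. A minimal sketch of a subscriber reading it (the handler shape here is illustrative, not the framework's actual subscription API):

    def on_llm_turn_complete(event) -> None:
        data = event.data
        # cached/cache_creation are subsets of input_tokens: display only, never summed.
        cost = float(data.get("cost_usd", 0.0))
        if cost > 0:
            print(f"turn cost ${cost:.4f} ({data['input_tokens']} in / {data['output_tokens']} out)")
        else:
            print("turn cost unreported (not necessarily free)")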
diff --git a/core/framework/host/event_bus.py b/core/framework/host/event_bus.py
index c72fc679..9c17dad4 100644
--- a/core/framework/host/event_bus.py
+++ b/core/framework/host/event_bus.py
@@ -810,6 +810,7 @@ class EventBus:
         output_tokens: int,
         cached_tokens: int = 0,
         cache_creation_tokens: int = 0,
+        cost_usd: float = 0.0,
         execution_id: str | None = None,
         iteration: int | None = None,
     ) -> None:
@@ -818,6 +819,9 @@ class EventBus:
         ``cached_tokens`` and ``cache_creation_tokens`` are subsets of
         ``input_tokens`` (already inside provider ``prompt_tokens``).
         Subscribers should display them, not add them to a total.
+
+        ``cost_usd`` is the USD cost for this turn when known (Anthropic,
+        OpenAI, OpenRouter). 0.0 means unreported (not free).
         """
         data: dict = {
             "stop_reason": stop_reason,
             "model": model,
             "input_tokens": input_tokens,
             "output_tokens": output_tokens,
             "cached_tokens": cached_tokens,
             "cache_creation_tokens": cache_creation_tokens,
+            "cost_usd": cost_usd,
         }
         if iteration is not None:
             data["iteration"] = iteration

diff --git a/core/framework/llm/litellm.py b/core/framework/llm/litellm.py
index 7171fcac..16681e70 100644
--- a/core/framework/llm/litellm.py
+++ b/core/framework/llm/litellm.py
@@ -360,6 +360,82 @@ FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"
 MAX_FAILED_REQUEST_DUMPS = 50


+def _extract_cost(response: Any, model: str) -> float:
+    """Pull the USD cost for a non-streaming completion response.
+
+    Sources checked, in priority order:
+    1. ``usage.cost`` — populated when OpenRouter returns native cost via
+       ``usage: {include: true}`` or when ``litellm.include_cost_in_streaming_usage``
+       is on.
+    2. ``response._hidden_params["response_cost"]`` — set by LiteLLM's
+       logging layer after most successful completions.
+    3. ``litellm.completion_cost(...)`` — computes from the model pricing
+       table; works across Anthropic, OpenAI, and OpenRouter as long as the
+       model is in LiteLLM's catalog.
+
+    Returns 0.0 for unpriced models or unexpected response shapes — cost is a
+    display concern, never let it break the hot path. For streaming paths
+    where the aggregate response isn't a full ``ModelResponse``, use
+    :func:`_cost_from_tokens` with the already-extracted token counts.
+    """
+    if response is None:
+        return 0.0
+    usage = getattr(response, "usage", None)
+    usage_cost = getattr(usage, "cost", None) if usage is not None else None
+    if isinstance(usage_cost, (int, float)) and usage_cost > 0:
+        return float(usage_cost)
+
+    hidden = getattr(response, "_hidden_params", None)
+    if isinstance(hidden, dict):
+        hp_cost = hidden.get("response_cost")
+        if isinstance(hp_cost, (int, float)) and hp_cost > 0:
+            return float(hp_cost)
+
+    try:
+        import litellm as _litellm
+
+        computed = _litellm.completion_cost(completion_response=response, model=model)
+        if isinstance(computed, (int, float)) and computed > 0:
+            return float(computed)
+    except Exception as exc:
+        logger.debug("[cost] completion_cost failed for %s: %s", model, exc)
+    return 0.0
+
+
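To make the priority order above concrete, here is a small illustration with stand-in objects (`FakeUsage`/`FakeResponse` are hypothetical doubles; the unit tests later in this patch exercise the same ordering with MagicMock):

    class FakeUsage:
        cost = 0.00123  # OpenRouter native cost (usage: {include: true})

    class FakeResponse:
        usage = FakeUsage()
        _hidden_params = {"response_cost": 0.0042}  # LiteLLM's logged estimate

    # usage.cost wins over _hidden_params, which wins over litellm.completion_cost():
    assert _extract_cost(FakeResponse(), "openrouter/anthropic/claude-opus-4.5") == 0.00123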
+ """ + if not model or (input_tokens == 0 and output_tokens == 0): + return 0.0 + try: + import litellm as _litellm + + prompt_cost, completion_cost = _litellm.cost_per_token( + model=model, + prompt_tokens=input_tokens, + completion_tokens=output_tokens, + cache_read_input_tokens=cached_tokens, + cache_creation_input_tokens=cache_creation_tokens, + ) + total = (prompt_cost or 0.0) + (completion_cost or 0.0) + return float(total) if total > 0 else 0.0 + except Exception as exc: + logger.debug("[cost] cost_per_token failed for %s: %s", model, exc) + return 0.0 + + def _extract_cache_tokens(usage: Any) -> tuple[int, int]: """Pull (cache_read, cache_creation) from a LiteLLM usage object. @@ -1115,6 +1191,7 @@ class LiteLLMProvider(LLMProvider): input_tokens = usage.prompt_tokens if usage else 0 output_tokens = usage.completion_tokens if usage else 0 cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage) + cost_usd = _extract_cost(response, self.model) return LLMResponse( content=content, @@ -1123,6 +1200,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, stop_reason=response.choices[0].finish_reason or "", raw_response=response, ) @@ -1338,6 +1416,7 @@ class LiteLLMProvider(LLMProvider): input_tokens = usage.prompt_tokens if usage else 0 output_tokens = usage.completion_tokens if usage else 0 cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage) + cost_usd = _extract_cost(response, self.model) return LLMResponse( content=content, @@ -1346,6 +1425,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, stop_reason=response.choices[0].finish_reason or "", raw_response=response, ) @@ -1821,6 +1901,7 @@ class LiteLLMProvider(LLMProvider): input_tokens = usage.prompt_tokens if usage else 0 output_tokens = usage.completion_tokens if usage else 0 cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage) + cost_usd = _extract_cost(response, self.model) stop_reason = "tool_calls" if tool_calls else (response.choices[0].finish_reason or "stop") return LLMResponse( @@ -1830,6 +1911,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, stop_reason=stop_reason, raw_response={ "compat_mode": "openrouter_tool_emulation", @@ -1891,6 +1973,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=response.output_tokens, cached_tokens=response.cached_tokens, cache_creation_tokens=response.cache_creation_tokens, + cost_usd=response.cost_usd, model=response.model, ) @@ -1960,6 +2043,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=response.output_tokens, cached_tokens=response.cached_tokens, cache_creation_tokens=response.cache_creation_tokens, + cost_usd=response.cost_usd, model=response.model, ) @@ -2286,6 +2370,13 @@ class LiteLLMProvider(LLMProvider): choice.finish_reason, self.model, ) + cost_usd = _cost_from_tokens( + self.model, + input_tokens, + output_tokens, + cached_tokens, + cache_creation_tokens, + ) tail_events.append( FinishEvent( stop_reason=choice.finish_reason, @@ -2293,6 +2384,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, model=self.model, ) ) @@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider): 
@@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider):
                 cache_creation_tokens,
                 self.model,
             )
+            cost_usd = _cost_from_tokens(
+                self.model,
+                input_tokens,
+                output_tokens,
+                cached_tokens,
+                cache_creation_tokens,
+            )
             # Patch the FinishEvent already queued with 0 tokens
             for _i, _ev in enumerate(tail_events):
                 if isinstance(_ev, FinishEvent) and _ev.input_tokens == 0:
                     tail_events[_i] = FinishEvent(
                         stop_reason=_ev.stop_reason,
                         input_tokens=input_tokens,
                         output_tokens=output_tokens,
                         cached_tokens=cached_tokens,
                         cache_creation_tokens=cache_creation_tokens,
+                        cost_usd=cost_usd,
                         model=_ev.model,
                     )
                     break

diff --git a/core/framework/llm/provider.py b/core/framework/llm/provider.py
index 5a95d06d..30af14d8 100644
--- a/core/framework/llm/provider.py
+++ b/core/framework/llm/provider.py
@@ -15,6 +15,10 @@ class LLMResponse:
     ``cached_tokens`` and ``cache_creation_tokens`` are subsets of
     ``input_tokens`` (providers report them inside ``prompt_tokens``).
     Surface them for visibility; do not add to a total.
+
+    ``cost_usd`` is the per-call USD cost when the provider / pricing table
+    can produce one (Anthropic, OpenAI, OpenRouter are supported). 0.0 when
+    unknown or unpriced — treat as "unreported", not "free".
     """

     content: str
@@ -23,6 +27,7 @@ class LLMResponse:
     output_tokens: int = 0
     cached_tokens: int = 0
     cache_creation_tokens: int = 0
+    cost_usd: float = 0.0
     stop_reason: str = ""
     raw_response: Any = None
@@ -189,6 +194,7 @@ class LLMProvider(ABC):
             output_tokens=response.output_tokens,
             cached_tokens=response.cached_tokens,
             cache_creation_tokens=response.cache_creation_tokens,
+            cost_usd=response.cost_usd,
             model=response.model,
         )

diff --git a/core/framework/llm/stream_events.py b/core/framework/llm/stream_events.py
index 5e7c31a7..74c16f85 100644
--- a/core/framework/llm/stream_events.py
+++ b/core/framework/llm/stream_events.py
@@ -70,6 +70,9 @@ class FinishEvent:
     ``cached_tokens`` and ``cache_creation_tokens`` are subsets of
     ``input_tokens`` — providers count both inside ``prompt_tokens`` already.
     Surface them separately for visibility; never add to a total.
+
+    ``cost_usd`` is the per-turn USD cost when the provider or LiteLLM's
+    pricing table supplies one; 0.0 means unreported (not free).
     """

     type: Literal["finish"] = "finish"
@@ -78,6 +81,7 @@ class FinishEvent:
     output_tokens: int = 0
     cached_tokens: int = 0
     cache_creation_tokens: int = 0
+    cost_usd: float = 0.0
     model: str = ""
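Before the frontend side: the only contract the UI relies on from these dataclasses is that `cost_usd` exists and defaults to 0.0, meaning "unreported" rather than "free". A minimal sketch of that surface (import path taken from the tests below):

    from framework.llm.stream_events import FinishEvent

    ev = FinishEvent(stop_reason="stop", input_tokens=1200, output_tokens=300, cost_usd=0.0087)
    assert ev.cost_usd == 0.0087
    assert FinishEvent().cost_usd == 0.0  # default: cost unknown, not zero-priced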
diff --git a/core/frontend/src/components/ChatPanel.tsx b/core/frontend/src/components/ChatPanel.tsx
index 128ec875..10738e93 100644
--- a/core/frontend/src/components/ChatPanel.tsx
+++ b/core/frontend/src/components/ChatPanel.tsx
@@ -155,7 +155,7 @@ interface ChatPanelProps {
    * `cached` (cache reads) and `cacheCreated` (cache writes) are subsets of
    * `input` — providers count both inside prompt_tokens. Display them
    * separately; do not add to a total. */
-  tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number };
+  tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number; costUsd?: number };
   /** Optional action element rendered on the right side of the "Conversation" header */
   headerAction?: React.ReactNode;
 }
@@ -1488,17 +1488,35 @@ export default function ChatPanel({
         {hasTokens && (() => {
           const cached = tokenUsage!.cached ?? 0;
           const created = tokenUsage!.cacheCreated ?? 0;
+          const cost = tokenUsage!.costUsd ?? 0;
           // cached/created are subsets of input — never sum; surface separately.
-          const title = [
-            "LLM tokens used this session",
-            `input ${fmt(tokenUsage!.input)}`,
-            ` cache read ${fmt(cached)}`,
-            ` cache write ${fmt(created)}`,
-            `output ${fmt(tokenUsage!.output)}`,
-          ].join("\n");
+          // Cost can be < $0.01; show 4 decimals so small-model sessions aren't "$0.00".
+          const costStr = cost > 0 ? `$${cost.toFixed(4)}` : "—";
           return (
-
+            Tokens: {fmt(tokenUsage!.output)}
+
+
+            LLM tokens used this session
+
+
+            Input
+            {fmt(tokenUsage!.input)}
+            cache read
+            {fmt(cached)}
+            cache write
+            {fmt(created)}
+            Output
+            {fmt(tokenUsage!.output)}
+            Cost
+
+            {costStr}
+
+
           );
         })()}

diff --git a/core/frontend/src/pages/queen-dm.tsx b/core/frontend/src/pages/queen-dm.tsx
index 9f767e61..dd26690e 100644
--- a/core/frontend/src/pages/queen-dm.tsx
+++ b/core/frontend/src/pages/queen-dm.tsx
@@ -73,11 +73,14 @@ export default function QueenDM() {
   const [awaitingInput, setAwaitingInput] = useState(false);
   // `cached` and `cacheCreated` are subsets of `input` (providers count both
   // inside prompt_tokens already) — display them, never add them to a total.
+  // `costUsd` is the session-total USD cost when the provider supplies one
+  // (Anthropic, OpenAI, OpenRouter); 0 means unreported, not free.
   const [tokenUsage, setTokenUsage] = useState({
     input: 0,
     output: 0,
     cached: 0,
     cacheCreated: 0,
+    costUsd: 0,
   });
   const [historySessions, setHistorySessions] = useState([]);
   const [historyLoading, setHistoryLoading] = useState(false);
@@ -125,7 +128,7 @@ export default function QueenDM() {
     setPendingQuestions(null);
     setAwaitingInput(false);
     setQueenPhase("independent");
-    setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0 });
+    setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0, costUsd: 0 });
     setInitialDraft(null);
     setColonySpawned(false);
     setSpawnedColonyName(null);
@@ -587,11 +590,13 @@ export default function QueenDM() {
       // separately for display, do NOT roll into input/total.
       const cached = (event.data.cached_tokens as number) || 0;
       const cacheCreated = (event.data.cache_creation_tokens as number) || 0;
+      const costUsd = (event.data.cost_usd as number) || 0;
       setTokenUsage((prev) => ({
         input: prev.input + inp,
         output: prev.output + out,
         cached: prev.cached + cached,
         cacheCreated: prev.cacheCreated + cacheCreated,
+        costUsd: prev.costUsd + costUsd,
       }));
     }
     // Flush one queued message per LLM turn boundary. This is the
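The page accumulates `costUsd` additively across turns (each turn is independent spend), while `cached`/`cacheCreated` remain display-only subsets of `input`. The same bookkeeping in Python, for any backend consumer that wants session totals from the bus payload (an illustrative helper, not part of this patch):

    session = {"input": 0, "output": 0, "cost_usd": 0.0}

    def on_turn(data: dict) -> None:
        session["input"] += int(data.get("input_tokens", 0))
        session["output"] += int(data.get("output_tokens", 0))
        session["cost_usd"] += float(data.get("cost_usd", 0.0))  # unreported turns add 0.0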
diff --git a/core/tests/test_event_bus.py b/core/tests/test_event_bus.py
index 5e5facb0..b10f334f 100644
--- a/core/tests/test_event_bus.py
+++ b/core/tests/test_event_bus.py
@@ -814,6 +814,7 @@ class TestConveniencePublishers:
             output_tokens=50,
             cached_tokens=30,
             cache_creation_tokens=10,
+            cost_usd=0.0042,
             execution_id="exec_1",
             iteration=3,
         )
@@ -828,6 +829,7 @@ class TestConveniencePublishers:
         # display, NOT additive to input_tokens.
         assert received[0].data["cached_tokens"] == 30
         assert received[0].data["cache_creation_tokens"] == 10
+        assert received[0].data["cost_usd"] == 0.0042
         assert received[0].data["iteration"] == 3

     @pytest.mark.asyncio

diff --git a/core/tests/test_litellm_provider.py b/core/tests/test_litellm_provider.py
index 18ab1123..9f795d84 100644
--- a/core/tests/test_litellm_provider.py
+++ b/core/tests/test_litellm_provider.py
@@ -25,8 +25,10 @@ from framework.llm.litellm import (
     LiteLLMProvider,
     _build_system_message,
     _compute_retry_delay,
+    _cost_from_tokens,
     _ensure_ollama_chat_prefix,
     _extract_cache_tokens,
+    _extract_cost,
     _is_ollama_model,
     _model_supports_cache_control,
     _summarize_request_for_log,
@@ -1512,3 +1514,124 @@ class TestStreamingChunksFallbackPreservesCacheFields:

         assert cached == 5601
         assert creation == 0
+
+
+class TestExtractCost:
+    """`_extract_cost` pulls USD cost from three sources in order:
+    usage.cost (OpenRouter native / include_cost_in_streaming_usage) →
+    response._hidden_params['response_cost'] (LiteLLM logging) →
+    litellm.completion_cost() (pricing-table fallback)."""
+
+    def test_none_response_returns_zero(self):
+        assert _extract_cost(None, "gpt-4o-mini") == 0.0
+
+    def test_openrouter_usage_cost_is_preferred(self):
+        """OpenRouter returns authoritative per-call cost on usage.cost when
+        the caller opts in (usage.include=true). That beats LiteLLM's
+        pricing-table estimate because it reflects promo pricing and BYOK markup."""
+        response = MagicMock()
+        response.usage = MagicMock(cost=0.00123)
+        response._hidden_params = {"response_cost": 99.99}  # should be ignored
+        assert _extract_cost(response, "openrouter/anthropic/claude-opus-4.5") == 0.00123
+
+    def test_hidden_params_response_cost_used_when_no_usage_cost(self):
+        """LiteLLM's logging layer attaches response_cost after most
+        completions — this is how OpenAI/Anthropic responses get costed
+        without going back to the pricing table."""
+        response = MagicMock()
+        response.usage = MagicMock(spec=[])  # no .cost attribute
+        response._hidden_params = {"response_cost": 0.0042}
+        assert _extract_cost(response, "gpt-4o-mini") == 0.0042
+
+    def test_falls_back_to_completion_cost_when_nothing_pre_populated(self):
+        """For providers where LiteLLM didn't pre-populate cost, call
+        litellm.completion_cost() against the pricing table. Mocked here
+        because we don't want tests depending on the exact price of
+        claude-sonnet-4.5 in LiteLLM's model map."""
+        response = MagicMock()
+        response.usage = MagicMock(spec=[])
+        response._hidden_params = {}
+        with patch("litellm.completion_cost", return_value=0.00789):
+            assert _extract_cost(response, "anthropic/claude-sonnet-4.5") == 0.00789
+
+    def test_completion_cost_exception_returns_zero(self):
+        """Unpriced models (e.g. new OpenRouter routes not yet in LiteLLM's
+        catalog) must not crash the hot path."""
+        response = MagicMock()
+        response.usage = MagicMock(spec=[])
+        response._hidden_params = {}
+        with patch("litellm.completion_cost", side_effect=Exception("no pricing")):
+            assert _extract_cost(response, "openrouter/mystery/model") == 0.0
+
+    def test_zero_cost_falls_through_to_next_source(self):
+        """usage.cost == 0 should NOT short-circuit; fall through to
+        _hidden_params / completion_cost so we don't cement a false zero."""
+        response = MagicMock()
+        response.usage = MagicMock(cost=0.0)
+        response._hidden_params = {"response_cost": 0.0055}
+        assert _extract_cost(response, "gpt-4o-mini") == 0.0055
+
+
+class TestCostFromTokens:
+    """`_cost_from_tokens` is the streaming-path cost helper: stream wrappers
+    don't expose the full ModelResponse shape that completion_cost() expects,
+    so we go through cost_per_token() with the already-extracted totals."""
+
+    def test_zero_tokens_returns_zero_without_calling_litellm(self):
+        with patch("litellm.cost_per_token") as mock:
+            assert _cost_from_tokens("claude-opus-4.5", 0, 0) == 0.0
+            mock.assert_not_called()
+
+    def test_empty_model_returns_zero(self):
+        assert _cost_from_tokens("", 1000, 500) == 0.0
+
+    def test_computes_from_tokens(self):
+        with patch("litellm.cost_per_token", return_value=(0.001, 0.002)) as mock:
+            cost = _cost_from_tokens(
+                "anthropic/claude-opus-4.5",
+                input_tokens=1000,
+                output_tokens=500,
+                cached_tokens=200,
+                cache_creation_tokens=100,
+            )
+        assert cost == pytest.approx(0.003)
+        # Verify the cache-aware kwargs are threaded through — Anthropic
+        # needs these to apply the 1.25x write / 0.1x read multipliers.
+        call_kwargs = mock.call_args.kwargs
+        assert call_kwargs["prompt_tokens"] == 1000
+        assert call_kwargs["completion_tokens"] == 500
+        assert call_kwargs["cache_read_input_tokens"] == 200
+        assert call_kwargs["cache_creation_input_tokens"] == 100
+
+    def test_exception_returns_zero(self):
+        with patch("litellm.cost_per_token", side_effect=Exception("unpriced")):
+            assert _cost_from_tokens("mystery/model", 1000, 500) == 0.0
+
+    def test_negative_or_none_components_coerce_to_zero(self):
+        """LiteLLM returns (None, None) for unknown models in some versions;
+        treat as 0 rather than crashing on None+None."""
+        with patch("litellm.cost_per_token", return_value=(None, None)):
+            assert _cost_from_tokens("some/model", 1, 1) == 0.0
+
+
+class TestLLMResponseAndFinishEventHaveCostUsd:
+    """Regression: both LLMResponse and FinishEvent must carry cost_usd so
+    the agent loop → event bus → frontend pipeline doesn't lose cost."""
+
+    def test_llm_response_defaults_cost_to_zero(self):
+        from framework.llm.provider import LLMResponse
+
+        r = LLMResponse(content="", model="m")
+        assert r.cost_usd == 0.0
+
+    def test_finish_event_defaults_cost_to_zero(self):
+        from framework.llm.stream_events import FinishEvent
+
+        e = FinishEvent()
+        assert e.cost_usd == 0.0
+
+    def test_finish_event_accepts_cost(self):
+        from framework.llm.stream_events import FinishEvent
+
+        e = FinishEvent(cost_usd=0.0123)
+        assert e.cost_usd == 0.0123

diff --git a/core/tests/test_stream_events.py b/core/tests/test_stream_events.py
index 105097ed..a8c0cadb 100644
--- a/core/tests/test_stream_events.py
+++ b/core/tests/test_stream_events.py
@@ -242,6 +242,7 @@ class TestEventSerialization:
             "output_tokens": 20,
             "cached_tokens": 0,
             "cache_creation_tokens": 0,
+            "cost_usd": 0.0,
             "model": "gpt-4",
         }
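As a closing sanity check on the four-decimal display choice in ChatPanel: with assumed prices of $3 per million input tokens and $15 per million output tokens (illustrative figures, not values read from LiteLLM's table), a small turn lands well under a cent and would round to "$0.00" at two decimals:

    input_cost = 1_000 / 1_000_000 * 3.0     # $0.0030
    output_cost = 200 / 1_000_000 * 15.0     # $0.0030
    print(f"${input_cost + output_cost:.4f}")  # prints $0.0060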