feat: cost tracking

Richard Tang
2026-04-23 15:34:07 -07:00
parent 8c6428f445
commit 77cc169606
11 changed files with 283 additions and 11 deletions
+7 -1
@@ -940,6 +940,7 @@ class AgentLoop(AgentProtocol):
output_tokens=turn_tokens.get("output", 0),
cached_tokens=turn_tokens.get("cached", 0),
cache_creation_tokens=turn_tokens.get("cache_creation", 0),
cost_usd=float(turn_tokens.get("cost", 0.0) or 0.0),
execution_id=execution_id,
iteration=iteration,
)
@@ -2340,7 +2341,9 @@ class AgentLoop(AgentProtocol):
stream_id = ctx.stream_id or ctx.agent_id
node_id = ctx.agent_id
execution_id = ctx.execution_id or ""
token_counts: dict[str, int] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0}
# Mixed-type dict: int token counts + str stop_reason/model + float cost.
# Typed loosely to avoid churn in the many call sites that read from it.
token_counts: dict[str, Any] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0, "cost": 0.0}
tool_call_count = 0
final_text = ""
final_system_prompt = conversation.system_prompt
@@ -2572,6 +2575,7 @@ class AgentLoop(AgentProtocol):
token_counts["output"] += event.output_tokens
token_counts["cached"] += event.cached_tokens
token_counts["cache_creation"] += event.cache_creation_tokens
token_counts["cost"] = token_counts.get("cost", 0.0) + event.cost_usd
token_counts["stop_reason"] = event.stop_reason
token_counts["model"] = event.model
@@ -4154,6 +4158,7 @@ class AgentLoop(AgentProtocol):
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str = "",
iteration: int | None = None,
) -> None:
@@ -4167,6 +4172,7 @@ class AgentLoop(AgentProtocol):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
execution_id=execution_id,
iteration=iteration,
)
@@ -109,6 +109,7 @@ async def publish_llm_turn_complete(
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str = "",
iteration: int | None = None,
) -> None:
@@ -122,6 +123,7 @@ async def publish_llm_turn_complete(
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
execution_id=execution_id,
iteration=iteration,
)
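A condensed, illustrative sketch of the accumulation pattern the hunks above add (not part of the commit; the `_Finish` stand-in and the sample numbers are hypothetical):

from dataclasses import dataclass
from typing import Any

@dataclass
class _Finish:
    """Hypothetical stand-in for FinishEvent, reduced to the fields used here."""
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0  # 0.0 means unreported, not free

def accumulate(events: list[_Finish]) -> dict[str, Any]:
    # Mirrors the mixed-type token_counts dict above: int counters plus a
    # float "cost" slot summed across turns.
    totals: dict[str, Any] = {"input": 0, "output": 0, "cost": 0.0}
    for ev in events:
        totals["input"] += ev.input_tokens
        totals["output"] += ev.output_tokens
        totals["cost"] = totals.get("cost", 0.0) + ev.cost_usd
    return totals

print(accumulate([_Finish(1200, 300, 0.0042), _Finish(800, 150, 0.0)]))
# {'input': 2000, 'output': 450, 'cost': 0.0042}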
+5
@@ -810,6 +810,7 @@ class EventBus:
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str | None = None,
iteration: int | None = None,
) -> None:
@@ -818,6 +819,9 @@ class EventBus:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` (already inside provider ``prompt_tokens``).
Subscribers should display them, not add them to a total.
``cost_usd`` is the USD cost for this turn when known (Anthropic,
OpenAI, OpenRouter). 0.0 means unreported (not free).
"""
data: dict = {
"stop_reason": stop_reason,
@@ -826,6 +830,7 @@ class EventBus:
"output_tokens": output_tokens,
"cached_tokens": cached_tokens,
"cache_creation_tokens": cache_creation_tokens,
"cost_usd": cost_usd,
}
if iteration is not None:
data["iteration"] = iteration
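A minimal subscriber-side sketch of how a consumer might render this payload (illustrative only; `format_turn_usage` is a hypothetical helper and the sample values are made up):

def format_turn_usage(data: dict) -> str:
    # cached/cache_creation are subsets of input_tokens: display them,
    # never add them to a total. cost_usd == 0.0 means unreported, not free.
    cost = data.get("cost_usd", 0.0)
    cost_str = f"${cost:.4f}" if cost > 0 else "unreported"
    return (
        f"in={data.get('input_tokens', 0)} "
        f"(cache read {data.get('cached_tokens', 0)}, "
        f"cache write {data.get('cache_creation_tokens', 0)}) "
        f"out={data.get('output_tokens', 0)} cost={cost_str}"
    )

print(format_turn_usage({
    "input_tokens": 100, "output_tokens": 50,
    "cached_tokens": 30, "cache_creation_tokens": 10,
    "cost_usd": 0.0042,
}))
# in=100 (cache read 30, cache write 10) out=50 cost=$0.0042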
+100
@@ -360,6 +360,82 @@ FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"
MAX_FAILED_REQUEST_DUMPS = 50
def _extract_cost(response: Any, model: str) -> float:
"""Pull the USD cost for a non-streaming completion response.
Sources checked, in priority order:
1. ``usage.cost`` populated when OpenRouter returns native cost via
``usage: {include: true}`` or when ``litellm.include_cost_in_streaming_usage``
is on.
2. ``response._hidden_params["response_cost"]`` set by LiteLLM's
logging layer after most successful completions.
3. ``litellm.completion_cost(...)`` computes from the model pricing
table; works across Anthropic, OpenAI, and OpenRouter as long as the
model is in LiteLLM's catalog.
Returns 0.0 for unpriced models or unexpected response shapes; cost is a
display concern, never let it break the hot path. For streaming paths
where the aggregate response isn't a full ``ModelResponse``, use
:func:`_cost_from_tokens` with the already-extracted token counts.
"""
if response is None:
return 0.0
usage = getattr(response, "usage", None)
usage_cost = getattr(usage, "cost", None) if usage is not None else None
if isinstance(usage_cost, (int, float)) and usage_cost > 0:
return float(usage_cost)
hidden = getattr(response, "_hidden_params", None)
if isinstance(hidden, dict):
hp_cost = hidden.get("response_cost")
if isinstance(hp_cost, (int, float)) and hp_cost > 0:
return float(hp_cost)
try:
import litellm as _litellm
computed = _litellm.completion_cost(completion_response=response, model=model)
if isinstance(computed, (int, float)) and computed > 0:
return float(computed)
except Exception as exc:
logger.debug("[cost] completion_cost failed for %s: %s", model, exc)
return 0.0
def _cost_from_tokens(
model: str,
input_tokens: int,
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
) -> float:
"""Compute USD cost from already-normalized token counts.
Used on streaming paths where the aggregate ``response`` is the stream
wrapper (not a full ``ModelResponse``) and ``litellm.completion_cost`` on
it either no-ops or raises. Calls ``litellm.cost_per_token`` directly
with the cache-aware inputs so Anthropic's 5-min-write / cache-read
multipliers are applied correctly.
"""
if not model or (input_tokens == 0 and output_tokens == 0):
return 0.0
try:
import litellm as _litellm
prompt_cost, completion_cost = _litellm.cost_per_token(
model=model,
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
cache_read_input_tokens=cached_tokens,
cache_creation_input_tokens=cache_creation_tokens,
)
total = (prompt_cost or 0.0) + (completion_cost or 0.0)
return float(total) if total > 0 else 0.0
except Exception as exc:
logger.debug("[cost] cost_per_token failed for %s: %s", model, exc)
return 0.0
def _extract_cache_tokens(usage: Any) -> tuple[int, int]:
"""Pull (cache_read, cache_creation) from a LiteLLM usage object.
@@ -1115,6 +1191,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
return LLMResponse(
content=content,
@@ -1123,6 +1200,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=response.choices[0].finish_reason or "",
raw_response=response,
)
@@ -1338,6 +1416,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
return LLMResponse(
content=content,
@@ -1346,6 +1425,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=response.choices[0].finish_reason or "",
raw_response=response,
)
@@ -1821,6 +1901,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
stop_reason = "tool_calls" if tool_calls else (response.choices[0].finish_reason or "stop")
return LLMResponse(
@@ -1830,6 +1911,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=stop_reason,
raw_response={
"compat_mode": "openrouter_tool_emulation",
@@ -1891,6 +1973,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
@@ -1960,6 +2043,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
@@ -2286,6 +2370,13 @@ class LiteLLMProvider(LLMProvider):
choice.finish_reason,
self.model,
)
cost_usd = _cost_from_tokens(
self.model,
input_tokens,
output_tokens,
cached_tokens,
cache_creation_tokens,
)
tail_events.append(
FinishEvent(
stop_reason=choice.finish_reason,
@@ -2293,6 +2384,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
model=self.model,
)
)
@@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider):
cache_creation_tokens,
self.model,
)
cost_usd = _cost_from_tokens(
self.model,
input_tokens,
output_tokens,
cached_tokens,
cache_creation_tokens,
)
# Patch the FinishEvent already queued with 0 tokens
for _i, _ev in enumerate(tail_events):
if isinstance(_ev, FinishEvent) and _ev.input_tokens == 0:
@@ -2344,6 +2443,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
model=_ev.model,
)
break
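A usage sketch for the two helpers added above (not part of the commit; it assumes the `framework.llm.litellm` import path used by the tests later in this diff and an Anthropic model that LiteLLM can price):

from framework.llm.litellm import _cost_from_tokens, _extract_cost

# Non-streaming path: the full ModelResponse is available, so _extract_cost
# can check usage.cost, then _hidden_params["response_cost"], then fall back
# to litellm.completion_cost().
# cost_usd = _extract_cost(response, "anthropic/claude-sonnet-4.5")

# Streaming path: only aggregated token counts exist, so compute from tokens
# with the cache-aware fields; unpriced models simply come back as 0.0.
cost_usd = _cost_from_tokens(
    "anthropic/claude-sonnet-4.5",
    input_tokens=1000,
    output_tokens=500,
    cached_tokens=200,
    cache_creation_tokens=100,
)
print(f"${cost_usd:.4f}" if cost_usd > 0 else "unreported")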
+6
@@ -15,6 +15,10 @@ class LLMResponse:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` (providers report them inside ``prompt_tokens``).
Surface them for visibility; do not add to a total.
``cost_usd`` is the per-call USD cost when the provider / pricing table
can produce one (Anthropic, OpenAI, OpenRouter are supported). 0.0 when
unknown or unpriced; treat as "unreported", not "free".
"""
content: str
@@ -23,6 +27,7 @@ class LLMResponse:
output_tokens: int = 0
cached_tokens: int = 0
cache_creation_tokens: int = 0
cost_usd: float = 0.0
stop_reason: str = ""
raw_response: Any = None
@@ -189,6 +194,7 @@ class LLMProvider(ABC):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
+4
@@ -70,6 +70,9 @@ class FinishEvent:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens``; providers count both inside ``prompt_tokens`` already.
Surface them separately for visibility; never add to a total.
``cost_usd`` is the per-turn USD cost when the provider or LiteLLM's
pricing table supplies one; 0.0 means unreported (not free).
"""
type: Literal["finish"] = "finish"
@@ -78,6 +81,7 @@ class FinishEvent:
output_tokens: int = 0
cached_tokens: int = 0
cache_creation_tokens: int = 0
cost_usd: float = 0.0
model: str = ""
+27 -9
@@ -155,7 +155,7 @@ interface ChatPanelProps {
* `cached` (cache reads) and `cacheCreated` (cache writes) are subsets of
* `input` — providers count both inside prompt_tokens. Display them
* separately; do not add to a total. */
tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number };
tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number; costUsd?: number };
/** Optional action element rendered on the right side of the "Conversation" header */
headerAction?: React.ReactNode;
}
@@ -1488,17 +1488,35 @@ export default function ChatPanel({
{hasTokens && (() => {
const cached = tokenUsage!.cached ?? 0;
const created = tokenUsage!.cacheCreated ?? 0;
const cost = tokenUsage!.costUsd ?? 0;
// cached/created are subsets of input — never sum; surface separately.
const title = [
"LLM tokens used this session",
`input ${fmt(tokenUsage!.input)}`,
` cache read ${fmt(cached)}`,
` cache write ${fmt(created)}`,
`output ${fmt(tokenUsage!.output)}`,
].join("\n");
// Cost can be < $0.01; show 4 decimals so small-model sessions aren't "$0.00".
const costStr = cost > 0 ? `$${cost.toFixed(4)}` : "—";
return (
<span title={title}>
<span className="group relative cursor-help transition-colors hover:text-muted-foreground">
Tokens: {fmt(tokenUsage!.output)}
<span
role="tooltip"
className="pointer-events-none invisible absolute bottom-full right-0 z-50 mb-2 whitespace-nowrap rounded-md border border-border bg-popover px-3 py-2 text-[11px] text-popover-foreground opacity-0 shadow-lg transition-[opacity,transform] duration-150 translate-y-1 group-hover:visible group-hover:opacity-100 group-hover:translate-y-0"
>
<span className="mb-1.5 block text-muted-foreground">
LLM tokens used this session
</span>
<span className="grid grid-cols-[auto_1fr] gap-x-4 gap-y-0.5 tabular-nums">
<span>Input</span>
<span className="text-right">{fmt(tokenUsage!.input)}</span>
<span className="pl-3 text-muted-foreground">cache read</span>
<span className="text-right text-muted-foreground">{fmt(cached)}</span>
<span className="pl-3 text-muted-foreground">cache write</span>
<span className="text-right text-muted-foreground">{fmt(created)}</span>
<span>Output</span>
<span className="text-right">{fmt(tokenUsage!.output)}</span>
<span className="mt-1 border-t border-border/50 pt-1">Cost</span>
<span className="mt-1 border-t border-border/50 pt-1 text-right font-medium">
{costStr}
</span>
</span>
</span>
</span>
);
})()}
+6 -1
@@ -73,11 +73,14 @@ export default function QueenDM() {
const [awaitingInput, setAwaitingInput] = useState(false);
// `cached` and `cacheCreated` are subsets of `input` (providers count both
// inside prompt_tokens already) — display them, never add them to a total.
// `costUsd` is the session-total USD cost when the provider supplies one
// (Anthropic, OpenAI, OpenRouter); 0 means unreported, not free.
const [tokenUsage, setTokenUsage] = useState({
input: 0,
output: 0,
cached: 0,
cacheCreated: 0,
costUsd: 0,
});
const [historySessions, setHistorySessions] = useState<HistorySession[]>([]);
const [historyLoading, setHistoryLoading] = useState(false);
@@ -125,7 +128,7 @@ export default function QueenDM() {
setPendingQuestions(null);
setAwaitingInput(false);
setQueenPhase("independent");
setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0 });
setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0, costUsd: 0 });
setInitialDraft(null);
setColonySpawned(false);
setSpawnedColonyName(null);
@@ -587,11 +590,13 @@ export default function QueenDM() {
// separately for display, do NOT roll into input/total.
const cached = (event.data.cached_tokens as number) || 0;
const cacheCreated = (event.data.cache_creation_tokens as number) || 0;
const costUsd = (event.data.cost_usd as number) || 0;
setTokenUsage((prev) => ({
input: prev.input + inp,
output: prev.output + out,
cached: prev.cached + cached,
cacheCreated: prev.cacheCreated + cacheCreated,
costUsd: prev.costUsd + costUsd,
}));
}
// Flush one queued message per LLM turn boundary. This is the
+2
@@ -814,6 +814,7 @@ class TestConveniencePublishers:
output_tokens=50,
cached_tokens=30,
cache_creation_tokens=10,
cost_usd=0.0042,
execution_id="exec_1",
iteration=3,
)
@@ -828,6 +829,7 @@ class TestConveniencePublishers:
# display, NOT additive to input_tokens.
assert received[0].data["cached_tokens"] == 30
assert received[0].data["cache_creation_tokens"] == 10
assert received[0].data["cost_usd"] == 0.0042
assert received[0].data["iteration"] == 3
@pytest.mark.asyncio
+123
@@ -25,8 +25,10 @@ from framework.llm.litellm import (
LiteLLMProvider,
_build_system_message,
_compute_retry_delay,
_cost_from_tokens,
_ensure_ollama_chat_prefix,
_extract_cache_tokens,
_extract_cost,
_is_ollama_model,
_model_supports_cache_control,
_summarize_request_for_log,
@@ -1512,3 +1514,124 @@ class TestStreamingChunksFallbackPreservesCacheFields:
assert cached == 5601
assert creation == 0
class TestExtractCost:
"""`_extract_cost` pulls USD cost from three sources in order:
usage.cost (OpenRouter native / include_cost_in_streaming_usage)
response._hidden_params['response_cost'] (LiteLLM logging)
litellm.completion_cost() (pricing-table fallback)."""
def test_none_response_returns_zero(self):
assert _extract_cost(None, "gpt-4o-mini") == 0.0
def test_openrouter_usage_cost_is_preferred(self):
"""OpenRouter returns authoritative per-call cost on usage.cost when
the caller opts in (usage.include=true). That beats LiteLLM's
pricing-table estimate because it reflects promo pricing and BYOK markup."""
response = MagicMock()
response.usage = MagicMock(cost=0.00123)
response._hidden_params = {"response_cost": 99.99} # should be ignored
assert _extract_cost(response, "openrouter/anthropic/claude-opus-4.5") == 0.00123
def test_hidden_params_response_cost_used_when_no_usage_cost(self):
"""LiteLLM's logging layer attaches response_cost after most
completions; this is how OpenAI/Anthropic responses get costed
without going back to the pricing table."""
response = MagicMock()
response.usage = MagicMock(spec=[]) # no .cost attribute
response._hidden_params = {"response_cost": 0.0042}
assert _extract_cost(response, "gpt-4o-mini") == 0.0042
def test_falls_back_to_completion_cost_when_nothing_pre_populated(self):
"""For providers where LiteLLM didn't pre-populate cost, call
litellm.completion_cost() against the pricing table. Mocked here
because we don't want tests depending on the exact price of
claude-sonnet-4.5 in LiteLLM's model map."""
response = MagicMock()
response.usage = MagicMock(spec=[])
response._hidden_params = {}
with patch("litellm.completion_cost", return_value=0.00789):
assert _extract_cost(response, "anthropic/claude-sonnet-4.5") == 0.00789
def test_completion_cost_exception_returns_zero(self):
"""Unpriced models (e.g. new OpenRouter routes not yet in LiteLLM's
catalog) must not crash the hot path."""
response = MagicMock()
response.usage = MagicMock(spec=[])
response._hidden_params = {}
with patch("litellm.completion_cost", side_effect=Exception("no pricing")):
assert _extract_cost(response, "openrouter/mystery/model") == 0.0
def test_zero_cost_falls_through_to_next_source(self):
"""usage.cost == 0 should NOT short-circuit; fall through to
_hidden_params / completion_cost so we don't cement a false zero."""
response = MagicMock()
response.usage = MagicMock(cost=0.0)
response._hidden_params = {"response_cost": 0.0055}
assert _extract_cost(response, "gpt-4o-mini") == 0.0055
class TestCostFromTokens:
"""`_cost_from_tokens` is the streaming-path cost helper: stream wrappers
don't expose the full ModelResponse shape that completion_cost() expects,
so we go through cost_per_token() with the already-extracted totals."""
def test_zero_tokens_returns_zero_without_calling_litellm(self):
with patch("litellm.cost_per_token") as mock:
assert _cost_from_tokens("claude-opus-4.5", 0, 0) == 0.0
mock.assert_not_called()
def test_empty_model_returns_zero(self):
assert _cost_from_tokens("", 1000, 500) == 0.0
def test_computes_from_tokens(self):
with patch("litellm.cost_per_token", return_value=(0.001, 0.002)) as mock:
cost = _cost_from_tokens(
"anthropic/claude-opus-4.5",
input_tokens=1000,
output_tokens=500,
cached_tokens=200,
cache_creation_tokens=100,
)
assert cost == pytest.approx(0.003)
# Verify the cache-aware kwargs are threaded through — Anthropic
# needs these to apply the 1.25x write / 0.1x read multipliers.
call_kwargs = mock.call_args.kwargs
assert call_kwargs["prompt_tokens"] == 1000
assert call_kwargs["completion_tokens"] == 500
assert call_kwargs["cache_read_input_tokens"] == 200
assert call_kwargs["cache_creation_input_tokens"] == 100
def test_exception_returns_zero(self):
with patch("litellm.cost_per_token", side_effect=Exception("unpriced")):
assert _cost_from_tokens("mystery/model", 1000, 500) == 0.0
def test_negative_or_none_components_coerce_to_zero(self):
"""LiteLLM returns (None, None) for unknown models in some versions;
treat as 0 rather than crashing on None+None."""
with patch("litellm.cost_per_token", return_value=(None, None)):
assert _cost_from_tokens("some/model", 1, 1) == 0.0
class TestLLMResponseAndFinishEventHaveCostUsd:
"""Regression: both LLMResponse and FinishEvent must carry cost_usd so
the agent loop -> event bus -> frontend pipeline doesn't lose cost."""
def test_llm_response_defaults_cost_to_zero(self):
from framework.llm.provider import LLMResponse
r = LLMResponse(content="", model="m")
assert r.cost_usd == 0.0
def test_finish_event_defaults_cost_to_zero(self):
from framework.llm.stream_events import FinishEvent
e = FinishEvent()
assert e.cost_usd == 0.0
def test_finish_event_accepts_cost(self):
from framework.llm.stream_events import FinishEvent
e = FinishEvent(cost_usd=0.0123)
assert e.cost_usd == 0.0123
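For intuition on the cache multipliers mentioned in `TestCostFromTokens`, here is a worked pricing example (illustrative only; the $3/MTok input and $15/MTok output rates are assumptions for the sketch, and the breakdown assumes the cached counts are carved out of `input_tokens` as the docstrings above describe):

input_price = 3.00 / 1_000_000    # assumed USD per uncached input token
output_price = 15.00 / 1_000_000  # assumed USD per output token

input_tokens = 1000           # total prompt tokens (cached subsets included)
cached_tokens = 200           # cache reads, billed at 0.1x the input rate
cache_creation_tokens = 100   # 5-min cache writes, billed at 1.25x the input rate
output_tokens = 500

uncached = input_tokens - cached_tokens - cache_creation_tokens
cost = (
    uncached * input_price
    + cached_tokens * input_price * 0.1
    + cache_creation_tokens * input_price * 1.25
    + output_tokens * output_price
)
print(f"${cost:.6f}")  # $0.010035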
+1
@@ -242,6 +242,7 @@ class TestEventSerialization:
"output_tokens": 20,
"cached_tokens": 0,
"cache_creation_tokens": 0,
"cost_usd": 0.0,
"model": "gpt-4",
}