From 77cc1696068cc7107f9d5ad05901380a4feae23e Mon Sep 17 00:00:00 2001
From: Richard Tang
Date: Thu, 23 Apr 2026 15:34:07 -0700
Subject: [PATCH] feat: cost tracking

---
 core/framework/agent_loop/agent_loop.py | 8 +-
 .../agent_loop/internals/event_publishing.py | 2 +
 core/framework/host/event_bus.py | 5 +
 core/framework/llm/litellm.py | 100 ++++++++++++++
 core/framework/llm/provider.py | 6 +
 core/framework/llm/stream_events.py | 4 +
 core/frontend/src/components/ChatPanel.tsx | 36 +++--
 core/frontend/src/pages/queen-dm.tsx | 7 +-
 core/tests/test_event_bus.py | 2 +
 core/tests/test_litellm_provider.py | 123 ++++++++++++++++++
 core/tests/test_stream_events.py | 1 +
 11 files changed, 283 insertions(+), 11 deletions(-)

diff --git a/core/framework/agent_loop/agent_loop.py b/core/framework/agent_loop/agent_loop.py
index 7ad47619..9e7ab2c3 100644
--- a/core/framework/agent_loop/agent_loop.py
+++ b/core/framework/agent_loop/agent_loop.py
@@ -940,6 +940,7 @@ class AgentLoop(AgentProtocol):
             output_tokens=turn_tokens.get("output", 0),
             cached_tokens=turn_tokens.get("cached", 0),
             cache_creation_tokens=turn_tokens.get("cache_creation", 0),
+            cost_usd=float(turn_tokens.get("cost", 0.0) or 0.0),
             execution_id=execution_id,
             iteration=iteration,
         )
@@ -2340,7 +2341,9 @@ class AgentLoop(AgentProtocol):
         stream_id = ctx.stream_id or ctx.agent_id
         node_id = ctx.agent_id
         execution_id = ctx.execution_id or ""
-        token_counts: dict[str, int] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0}
+        # Mixed-type dict: int token counts + str stop_reason/model + float cost.
+        # Typed loosely to avoid churn in the many call sites that read from it.
+        token_counts: dict[str, Any] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0, "cost": 0.0}
         tool_call_count = 0
         final_text = ""
         final_system_prompt = conversation.system_prompt
@@ -2572,6 +2575,7 @@
                 token_counts["output"] += event.output_tokens
                 token_counts["cached"] += event.cached_tokens
                 token_counts["cache_creation"] += event.cache_creation_tokens
+                token_counts["cost"] = token_counts.get("cost", 0.0) + event.cost_usd
                 token_counts["stop_reason"] = event.stop_reason
                 token_counts["model"] = event.model
@@ -4154,6 +4158,7 @@ class AgentLoop(AgentProtocol):
         output_tokens: int,
         cached_tokens: int = 0,
         cache_creation_tokens: int = 0,
+        cost_usd: float = 0.0,
         execution_id: str = "",
         iteration: int | None = None,
     ) -> None:
@@ -4167,6 +4172,7 @@ class AgentLoop(AgentProtocol):
             output_tokens=output_tokens,
             cached_tokens=cached_tokens,
             cache_creation_tokens=cache_creation_tokens,
+            cost_usd=cost_usd,
             execution_id=execution_id,
             iteration=iteration,
         )

diff --git a/core/framework/agent_loop/internals/event_publishing.py b/core/framework/agent_loop/internals/event_publishing.py
index cac4f9e4..87ba2a69 100644
--- a/core/framework/agent_loop/internals/event_publishing.py
+++ b/core/framework/agent_loop/internals/event_publishing.py
@@ -109,6 +109,7 @@ async def publish_llm_turn_complete(
     output_tokens: int,
     cached_tokens: int = 0,
     cache_creation_tokens: int = 0,
+    cost_usd: float = 0.0,
     execution_id: str = "",
     iteration: int | None = None,
 ) -> None:
@@ -122,6 +123,7 @@ async def publish_llm_turn_complete(
         output_tokens=output_tokens,
         cached_tokens=cached_tokens,
         cache_creation_tokens=cache_creation_tokens,
+        cost_usd=cost_usd,
         execution_id=execution_id,
         iteration=iteration,
     )
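The publishers above only thread `cost_usd` through to the bus, where it becomes one more key on the event's data dict. A minimal sketch of a subscriber reading it (the handler shape here is illustrative, not the framework's actual subscription API):

    def on_llm_turn_complete(event) -> None:
        data = event.data
        # cached/cache_creation are subsets of input_tokens: display only, never summed.
        cost = float(data.get("cost_usd", 0.0))
        if cost > 0:
            print(f"turn cost ${cost:.4f} ({data['input_tokens']} in / {data['output_tokens']} out)")
        else:
            print("turn cost unreported (not necessarily free)")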
diff --git a/core/framework/host/event_bus.py b/core/framework/host/event_bus.py
index c72fc679..9c17dad4 100644
--- a/core/framework/host/event_bus.py
+++ b/core/framework/host/event_bus.py
@@ -810,6 +810,7 @@ class EventBus:
         output_tokens: int,
         cached_tokens: int = 0,
         cache_creation_tokens: int = 0,
+        cost_usd: float = 0.0,
         execution_id: str | None = None,
         iteration: int | None = None,
     ) -> None:
@@ -818,6 +819,9 @@ class EventBus:
         ``cached_tokens`` and ``cache_creation_tokens`` are subsets of
         ``input_tokens`` (already inside provider ``prompt_tokens``).
         Subscribers should display them, not add them to a total.
+
+        ``cost_usd`` is the USD cost for this turn when known (Anthropic,
+        OpenAI, OpenRouter). 0.0 means unreported (not free).
         """
         data: dict = {
             "stop_reason": stop_reason,
             "model": model,
             "input_tokens": input_tokens,
             "output_tokens": output_tokens,
             "cached_tokens": cached_tokens,
             "cache_creation_tokens": cache_creation_tokens,
+            "cost_usd": cost_usd,
         }
         if iteration is not None:
             data["iteration"] = iteration

diff --git a/core/framework/llm/litellm.py b/core/framework/llm/litellm.py
index 7171fcac..16681e70 100644
--- a/core/framework/llm/litellm.py
+++ b/core/framework/llm/litellm.py
@@ -360,6 +360,82 @@ FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"
 MAX_FAILED_REQUEST_DUMPS = 50


+def _extract_cost(response: Any, model: str) -> float:
+    """Pull the USD cost for a non-streaming completion response.
+
+    Sources checked, in priority order:
+    1. ``usage.cost`` — populated when OpenRouter returns native cost via
+       ``usage: {include: true}`` or when ``litellm.include_cost_in_streaming_usage``
+       is on.
+    2. ``response._hidden_params["response_cost"]`` — set by LiteLLM's
+       logging layer after most successful completions.
+    3. ``litellm.completion_cost(...)`` — computes from the model pricing
+       table; works across Anthropic, OpenAI, and OpenRouter as long as the
+       model is in LiteLLM's catalog.
+
+    Returns 0.0 for unpriced models or unexpected response shapes — cost is a
+    display concern, never let it break the hot path. For streaming paths
+    where the aggregate response isn't a full ``ModelResponse``, use
+    :func:`_cost_from_tokens` with the already-extracted token counts.
+    """
+    if response is None:
+        return 0.0
+    usage = getattr(response, "usage", None)
+    usage_cost = getattr(usage, "cost", None) if usage is not None else None
+    if isinstance(usage_cost, (int, float)) and usage_cost > 0:
+        return float(usage_cost)
+
+    hidden = getattr(response, "_hidden_params", None)
+    if isinstance(hidden, dict):
+        hp_cost = hidden.get("response_cost")
+        if isinstance(hp_cost, (int, float)) and hp_cost > 0:
+            return float(hp_cost)
+
+    try:
+        import litellm as _litellm
+
+        computed = _litellm.completion_cost(completion_response=response, model=model)
+        if isinstance(computed, (int, float)) and computed > 0:
+            return float(computed)
+    except Exception as exc:
+        logger.debug("[cost] completion_cost failed for %s: %s", model, exc)
+    return 0.0
+
+
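To make the priority order above concrete, here is a small illustration with stand-in objects (`FakeUsage`/`FakeResponse` are hypothetical doubles; the unit tests later in this patch exercise the same ordering with MagicMock):

    class FakeUsage:
        cost = 0.00123  # OpenRouter native cost (usage: {include: true})

    class FakeResponse:
        usage = FakeUsage()
        _hidden_params = {"response_cost": 0.0042}  # LiteLLM's logged estimate

    # usage.cost wins over _hidden_params, which wins over litellm.completion_cost():
    assert _extract_cost(FakeResponse(), "openrouter/anthropic/claude-opus-4.5") == 0.00123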
+ """ + if not model or (input_tokens == 0 and output_tokens == 0): + return 0.0 + try: + import litellm as _litellm + + prompt_cost, completion_cost = _litellm.cost_per_token( + model=model, + prompt_tokens=input_tokens, + completion_tokens=output_tokens, + cache_read_input_tokens=cached_tokens, + cache_creation_input_tokens=cache_creation_tokens, + ) + total = (prompt_cost or 0.0) + (completion_cost or 0.0) + return float(total) if total > 0 else 0.0 + except Exception as exc: + logger.debug("[cost] cost_per_token failed for %s: %s", model, exc) + return 0.0 + + def _extract_cache_tokens(usage: Any) -> tuple[int, int]: """Pull (cache_read, cache_creation) from a LiteLLM usage object. @@ -1115,6 +1191,7 @@ class LiteLLMProvider(LLMProvider): input_tokens = usage.prompt_tokens if usage else 0 output_tokens = usage.completion_tokens if usage else 0 cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage) + cost_usd = _extract_cost(response, self.model) return LLMResponse( content=content, @@ -1123,6 +1200,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, stop_reason=response.choices[0].finish_reason or "", raw_response=response, ) @@ -1338,6 +1416,7 @@ class LiteLLMProvider(LLMProvider): input_tokens = usage.prompt_tokens if usage else 0 output_tokens = usage.completion_tokens if usage else 0 cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage) + cost_usd = _extract_cost(response, self.model) return LLMResponse( content=content, @@ -1346,6 +1425,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, stop_reason=response.choices[0].finish_reason or "", raw_response=response, ) @@ -1821,6 +1901,7 @@ class LiteLLMProvider(LLMProvider): input_tokens = usage.prompt_tokens if usage else 0 output_tokens = usage.completion_tokens if usage else 0 cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage) + cost_usd = _extract_cost(response, self.model) stop_reason = "tool_calls" if tool_calls else (response.choices[0].finish_reason or "stop") return LLMResponse( @@ -1830,6 +1911,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, stop_reason=stop_reason, raw_response={ "compat_mode": "openrouter_tool_emulation", @@ -1891,6 +1973,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=response.output_tokens, cached_tokens=response.cached_tokens, cache_creation_tokens=response.cache_creation_tokens, + cost_usd=response.cost_usd, model=response.model, ) @@ -1960,6 +2043,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=response.output_tokens, cached_tokens=response.cached_tokens, cache_creation_tokens=response.cache_creation_tokens, + cost_usd=response.cost_usd, model=response.model, ) @@ -2286,6 +2370,13 @@ class LiteLLMProvider(LLMProvider): choice.finish_reason, self.model, ) + cost_usd = _cost_from_tokens( + self.model, + input_tokens, + output_tokens, + cached_tokens, + cache_creation_tokens, + ) tail_events.append( FinishEvent( stop_reason=choice.finish_reason, @@ -2293,6 +2384,7 @@ class LiteLLMProvider(LLMProvider): output_tokens=output_tokens, cached_tokens=cached_tokens, cache_creation_tokens=cache_creation_tokens, + cost_usd=cost_usd, model=self.model, ) ) @@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider): 
@@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider):
                 cache_creation_tokens,
                 self.model,
             )
+            cost_usd = _cost_from_tokens(
+                self.model,
+                input_tokens,
+                output_tokens,
+                cached_tokens,
+                cache_creation_tokens,
+            )
             # Patch the FinishEvent already queued with 0 tokens
             for _i, _ev in enumerate(tail_events):
                 if isinstance(_ev, FinishEvent) and _ev.input_tokens == 0:
                     tail_events[_i] = FinishEvent(
                         stop_reason=_ev.stop_reason,
                         input_tokens=input_tokens,
                         output_tokens=output_tokens,
                         cached_tokens=cached_tokens,
                         cache_creation_tokens=cache_creation_tokens,
+                        cost_usd=cost_usd,
                         model=_ev.model,
                     )
                     break

diff --git a/core/framework/llm/provider.py b/core/framework/llm/provider.py
index 5a95d06d..30af14d8 100644
--- a/core/framework/llm/provider.py
+++ b/core/framework/llm/provider.py
@@ -15,6 +15,10 @@ class LLMResponse:
     ``cached_tokens`` and ``cache_creation_tokens`` are subsets of
     ``input_tokens`` (providers report them inside ``prompt_tokens``).
     Surface them for visibility; do not add to a total.
+
+    ``cost_usd`` is the per-call USD cost when the provider / pricing table
+    can produce one (Anthropic, OpenAI, OpenRouter are supported). 0.0 when
+    unknown or unpriced — treat as "unreported", not "free".
     """

     content: str
@@ -23,6 +27,7 @@ class LLMResponse:
     output_tokens: int = 0
     cached_tokens: int = 0
     cache_creation_tokens: int = 0
+    cost_usd: float = 0.0
     stop_reason: str = ""
     raw_response: Any = None
@@ -189,6 +194,7 @@ class LLMProvider(ABC):
             output_tokens=response.output_tokens,
             cached_tokens=response.cached_tokens,
             cache_creation_tokens=response.cache_creation_tokens,
+            cost_usd=response.cost_usd,
             model=response.model,
         )

diff --git a/core/framework/llm/stream_events.py b/core/framework/llm/stream_events.py
index 5e7c31a7..74c16f85 100644
--- a/core/framework/llm/stream_events.py
+++ b/core/framework/llm/stream_events.py
@@ -70,6 +70,9 @@ class FinishEvent:
     ``cached_tokens`` and ``cache_creation_tokens`` are subsets of
     ``input_tokens`` — providers count both inside ``prompt_tokens`` already.
     Surface them separately for visibility; never add to a total.
+
+    ``cost_usd`` is the per-turn USD cost when the provider or LiteLLM's
+    pricing table supplies one; 0.0 means unreported (not free).
     """

     type: Literal["finish"] = "finish"
@@ -78,6 +81,7 @@ class FinishEvent:
     output_tokens: int = 0
     cached_tokens: int = 0
     cache_creation_tokens: int = 0
+    cost_usd: float = 0.0
     model: str = ""
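Before the frontend side: the only contract the UI relies on from these dataclasses is that `cost_usd` exists and defaults to 0.0, meaning "unreported" rather than "free". A minimal sketch of that surface (import path taken from the tests below):

    from framework.llm.stream_events import FinishEvent

    ev = FinishEvent(stop_reason="stop", input_tokens=1200, output_tokens=300, cost_usd=0.0087)
    assert ev.cost_usd == 0.0087
    assert FinishEvent().cost_usd == 0.0  # default: cost unknown, not zero-priced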
diff --git a/core/frontend/src/components/ChatPanel.tsx b/core/frontend/src/components/ChatPanel.tsx
index 128ec875..10738e93 100644
--- a/core/frontend/src/components/ChatPanel.tsx
+++ b/core/frontend/src/components/ChatPanel.tsx
@@ -155,7 +155,7 @@ interface ChatPanelProps {
    * `cached` (cache reads) and `cacheCreated` (cache writes) are subsets of
    * `input` — providers count both inside prompt_tokens. Display them
    * separately; do not add to a total. */
-  tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number };
+  tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number; costUsd?: number };
   /** Optional action element rendered on the right side of the "Conversation" header */
   headerAction?: React.ReactNode;
 }
@@ -1488,17 +1488,35 @@ export default function ChatPanel({
         {hasTokens && (() => {
           const cached = tokenUsage!.cached ?? 0;
           const created = tokenUsage!.cacheCreated ?? 0;
+          const cost = tokenUsage!.costUsd ?? 0;
           // cached/created are subsets of input — never sum; surface separately.
-          const title = [
-            "LLM tokens used this session",
-            `input ${fmt(tokenUsage!.input)}`,
-            ` cache read ${fmt(cached)}`,
-            ` cache write ${fmt(created)}`,
-            `output ${fmt(tokenUsage!.output)}`,
-          ].join("\n");
+          // Cost can be < $0.01; show 4 decimals so small-model sessions aren't "$0.00".
+          const costStr = cost > 0 ? `$${cost.toFixed(4)}` : "—";
           return (
-
+            Tokens: {fmt(tokenUsage!.output)}
+
+
+            LLM tokens used this session
+
+
+            Input
+            {fmt(tokenUsage!.input)}
+            cache read
+            {fmt(cached)}
+            cache write
+            {fmt(created)}
+            Output
+            {fmt(tokenUsage!.output)}
+            Cost
+
+            {costStr}
+
+
           );
         })()}

diff --git a/core/frontend/src/pages/queen-dm.tsx b/core/frontend/src/pages/queen-dm.tsx
index 9f767e61..dd26690e 100644
--- a/core/frontend/src/pages/queen-dm.tsx
+++ b/core/frontend/src/pages/queen-dm.tsx
@@ -73,11 +73,14 @@ export default function QueenDM() {
   const [awaitingInput, setAwaitingInput] = useState(false);
   // `cached` and `cacheCreated` are subsets of `input` (providers count both
   // inside prompt_tokens already) — display them, never add them to a total.
+  // `costUsd` is the session-total USD cost when the provider supplies one
+  // (Anthropic, OpenAI, OpenRouter); 0 means unreported, not free.
   const [tokenUsage, setTokenUsage] = useState({
     input: 0,
     output: 0,
     cached: 0,
     cacheCreated: 0,
+    costUsd: 0,
   });
   const [historySessions, setHistorySessions] = useState([]);
   const [historyLoading, setHistoryLoading] = useState(false);
@@ -125,7 +128,7 @@ export default function QueenDM() {
     setPendingQuestions(null);
     setAwaitingInput(false);
     setQueenPhase("independent");
-    setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0 });
+    setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0, costUsd: 0 });
     setInitialDraft(null);
     setColonySpawned(false);
     setSpawnedColonyName(null);
@@ -587,11 +590,13 @@ export default function QueenDM() {
       // separately for display, do NOT roll into input/total.
       const cached = (event.data.cached_tokens as number) || 0;
       const cacheCreated = (event.data.cache_creation_tokens as number) || 0;
+      const costUsd = (event.data.cost_usd as number) || 0;
       setTokenUsage((prev) => ({
         input: prev.input + inp,
         output: prev.output + out,
         cached: prev.cached + cached,
         cacheCreated: prev.cacheCreated + cacheCreated,
+        costUsd: prev.costUsd + costUsd,
       }));
     }
     // Flush one queued message per LLM turn boundary. This is the
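The page accumulates `costUsd` additively across turns (each turn is independent spend), while `cached`/`cacheCreated` remain display-only subsets of `input`. The same bookkeeping in Python, for any backend consumer that wants session totals from the bus payload (an illustrative helper, not part of this patch):

    session = {"input": 0, "output": 0, "cost_usd": 0.0}

    def on_turn(data: dict) -> None:
        session["input"] += int(data.get("input_tokens", 0))
        session["output"] += int(data.get("output_tokens", 0))
        session["cost_usd"] += float(data.get("cost_usd", 0.0))  # unreported turns add 0.0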
diff --git a/core/tests/test_event_bus.py b/core/tests/test_event_bus.py
index 5e5facb0..b10f334f 100644
--- a/core/tests/test_event_bus.py
+++ b/core/tests/test_event_bus.py
@@ -814,6 +814,7 @@ class TestConveniencePublishers:
             output_tokens=50,
             cached_tokens=30,
             cache_creation_tokens=10,
+            cost_usd=0.0042,
             execution_id="exec_1",
             iteration=3,
         )
@@ -828,6 +829,7 @@ class TestConveniencePublishers:
         # display, NOT additive to input_tokens.
         assert received[0].data["cached_tokens"] == 30
         assert received[0].data["cache_creation_tokens"] == 10
+        assert received[0].data["cost_usd"] == 0.0042
         assert received[0].data["iteration"] == 3

     @pytest.mark.asyncio

diff --git a/core/tests/test_litellm_provider.py b/core/tests/test_litellm_provider.py
index 18ab1123..9f795d84 100644
--- a/core/tests/test_litellm_provider.py
+++ b/core/tests/test_litellm_provider.py
@@ -25,8 +25,10 @@ from framework.llm.litellm import (
     LiteLLMProvider,
     _build_system_message,
     _compute_retry_delay,
+    _cost_from_tokens,
     _ensure_ollama_chat_prefix,
     _extract_cache_tokens,
+    _extract_cost,
     _is_ollama_model,
     _model_supports_cache_control,
     _summarize_request_for_log,
@@ -1512,3 +1514,124 @@ class TestStreamingChunksFallbackPreservesCacheFields:

         assert cached == 5601
         assert creation == 0
+
+
+class TestExtractCost:
+    """`_extract_cost` pulls USD cost from three sources in order:
+    usage.cost (OpenRouter native / include_cost_in_streaming_usage) →
+    response._hidden_params['response_cost'] (LiteLLM logging) →
+    litellm.completion_cost() (pricing-table fallback)."""
+
+    def test_none_response_returns_zero(self):
+        assert _extract_cost(None, "gpt-4o-mini") == 0.0
+
+    def test_openrouter_usage_cost_is_preferred(self):
+        """OpenRouter returns authoritative per-call cost on usage.cost when
+        the caller opts in (usage.include=true). That beats LiteLLM's
+        pricing-table estimate because it reflects promo pricing and BYOK markup."""
+        response = MagicMock()
+        response.usage = MagicMock(cost=0.00123)
+        response._hidden_params = {"response_cost": 99.99}  # should be ignored
+        assert _extract_cost(response, "openrouter/anthropic/claude-opus-4.5") == 0.00123
+
+    def test_hidden_params_response_cost_used_when_no_usage_cost(self):
+        """LiteLLM's logging layer attaches response_cost after most
+        completions — this is how OpenAI/Anthropic responses get costed
+        without going back to the pricing table."""
+        response = MagicMock()
+        response.usage = MagicMock(spec=[])  # no .cost attribute
+        response._hidden_params = {"response_cost": 0.0042}
+        assert _extract_cost(response, "gpt-4o-mini") == 0.0042
+
+    def test_falls_back_to_completion_cost_when_nothing_pre_populated(self):
+        """For providers where LiteLLM didn't pre-populate cost, call
+        litellm.completion_cost() against the pricing table. Mocked here
+        because we don't want tests depending on the exact price of
+        claude-sonnet-4.5 in LiteLLM's model map."""
+        response = MagicMock()
+        response.usage = MagicMock(spec=[])
+        response._hidden_params = {}
+        with patch("litellm.completion_cost", return_value=0.00789):
+            assert _extract_cost(response, "anthropic/claude-sonnet-4.5") == 0.00789
+
+    def test_completion_cost_exception_returns_zero(self):
+        """Unpriced models (e.g. new OpenRouter routes not yet in LiteLLM's
+        catalog) must not crash the hot path."""
+        response = MagicMock()
+        response.usage = MagicMock(spec=[])
+        response._hidden_params = {}
+        with patch("litellm.completion_cost", side_effect=Exception("no pricing")):
+            assert _extract_cost(response, "openrouter/mystery/model") == 0.0
+
+    def test_zero_cost_falls_through_to_next_source(self):
+        """usage.cost == 0 should NOT short-circuit; fall through to
+        _hidden_params / completion_cost so we don't cement a false zero."""
+        response = MagicMock()
+        response.usage = MagicMock(cost=0.0)
+        response._hidden_params = {"response_cost": 0.0055}
+        assert _extract_cost(response, "gpt-4o-mini") == 0.0055
+
+
+class TestCostFromTokens:
+    """`_cost_from_tokens` is the streaming-path cost helper: stream wrappers
+    don't expose the full ModelResponse shape that completion_cost() expects,
+    so we go through cost_per_token() with the already-extracted totals."""
+
+    def test_zero_tokens_returns_zero_without_calling_litellm(self):
+        with patch("litellm.cost_per_token") as mock:
+            assert _cost_from_tokens("claude-opus-4.5", 0, 0) == 0.0
+            mock.assert_not_called()
+
+    def test_empty_model_returns_zero(self):
+        assert _cost_from_tokens("", 1000, 500) == 0.0
+
+    def test_computes_from_tokens(self):
+        with patch("litellm.cost_per_token", return_value=(0.001, 0.002)) as mock:
+            cost = _cost_from_tokens(
+                "anthropic/claude-opus-4.5",
+                input_tokens=1000,
+                output_tokens=500,
+                cached_tokens=200,
+                cache_creation_tokens=100,
+            )
+        assert cost == pytest.approx(0.003)
+        # Verify the cache-aware kwargs are threaded through — Anthropic
+        # needs these to apply the 1.25x write / 0.1x read multipliers.
+        call_kwargs = mock.call_args.kwargs
+        assert call_kwargs["prompt_tokens"] == 1000
+        assert call_kwargs["completion_tokens"] == 500
+        assert call_kwargs["cache_read_input_tokens"] == 200
+        assert call_kwargs["cache_creation_input_tokens"] == 100
+
+    def test_exception_returns_zero(self):
+        with patch("litellm.cost_per_token", side_effect=Exception("unpriced")):
+            assert _cost_from_tokens("mystery/model", 1000, 500) == 0.0
+
+    def test_negative_or_none_components_coerce_to_zero(self):
+        """LiteLLM returns (None, None) for unknown models in some versions;
+        treat as 0 rather than crashing on None+None."""
+        with patch("litellm.cost_per_token", return_value=(None, None)):
+            assert _cost_from_tokens("some/model", 1, 1) == 0.0
+
+
+class TestLLMResponseAndFinishEventHaveCostUsd:
+    """Regression: both LLMResponse and FinishEvent must carry cost_usd so
+    the agent loop → event bus → frontend pipeline doesn't lose cost."""
+
+    def test_llm_response_defaults_cost_to_zero(self):
+        from framework.llm.provider import LLMResponse
+
+        r = LLMResponse(content="", model="m")
+        assert r.cost_usd == 0.0
+
+    def test_finish_event_defaults_cost_to_zero(self):
+        from framework.llm.stream_events import FinishEvent
+
+        e = FinishEvent()
+        assert e.cost_usd == 0.0
+
+    def test_finish_event_accepts_cost(self):
+        from framework.llm.stream_events import FinishEvent
+
+        e = FinishEvent(cost_usd=0.0123)
+        assert e.cost_usd == 0.0123

diff --git a/core/tests/test_stream_events.py b/core/tests/test_stream_events.py
index 105097ed..a8c0cadb 100644
--- a/core/tests/test_stream_events.py
+++ b/core/tests/test_stream_events.py
@@ -242,6 +242,7 @@ class TestEventSerialization:
             "output_tokens": 20,
             "cached_tokens": 0,
             "cache_creation_tokens": 0,
+            "cost_usd": 0.0,
             "model": "gpt-4",
         }
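As a closing sanity check on the four-decimal display choice in ChatPanel: with assumed prices of $3 per million input tokens and $15 per million output tokens (illustrative figures, not values read from LiteLLM's table), a small turn lands well under a cent and would round to "$0.00" at two decimals:

    input_cost = 1_000 / 1_000_000 * 3.0     # $0.0030
    output_cost = 200 / 1_000_000 * 15.0     # $0.0030
    print(f"${input_cost + output_cost:.4f}")  # prints $0.0060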