feat: cost tracking

Richard Tang
2026-04-23 15:34:07 -07:00
parent 8c6428f445
commit 77cc169606
11 changed files with 283 additions and 11 deletions
+7 -1
@@ -940,6 +940,7 @@ class AgentLoop(AgentProtocol):
output_tokens=turn_tokens.get("output", 0),
cached_tokens=turn_tokens.get("cached", 0),
cache_creation_tokens=turn_tokens.get("cache_creation", 0),
cost_usd=float(turn_tokens.get("cost", 0.0) or 0.0),
execution_id=execution_id,
iteration=iteration,
)
@@ -2340,7 +2341,9 @@ class AgentLoop(AgentProtocol):
stream_id = ctx.stream_id or ctx.agent_id
node_id = ctx.agent_id
execution_id = ctx.execution_id or ""
token_counts: dict[str, int] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0}
# Mixed-type dict: int token counts + str stop_reason/model + float cost.
# Typed loosely to avoid churn in the many call sites that read from it.
token_counts: dict[str, Any] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0, "cost": 0.0}
tool_call_count = 0
final_text = ""
final_system_prompt = conversation.system_prompt
@@ -2572,6 +2575,7 @@ class AgentLoop(AgentProtocol):
token_counts["output"] += event.output_tokens
token_counts["cached"] += event.cached_tokens
token_counts["cache_creation"] += event.cache_creation_tokens
token_counts["cost"] = token_counts.get("cost", 0.0) + event.cost_usd
token_counts["stop_reason"] = event.stop_reason
token_counts["model"] = event.model
@@ -4154,6 +4158,7 @@ class AgentLoop(AgentProtocol):
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str = "",
iteration: int | None = None,
) -> None:
@@ -4167,6 +4172,7 @@ class AgentLoop(AgentProtocol):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
execution_id=execution_id,
iteration=iteration,
)
@@ -109,6 +109,7 @@ async def publish_llm_turn_complete(
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str = "",
iteration: int | None = None,
) -> None:
@@ -122,6 +123,7 @@ async def publish_llm_turn_complete(
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
execution_id=execution_id,
iteration=iteration,
)
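A condensed, illustrative sketch of the accumulation pattern the hunks above add (not part of the commit; the `_Finish` stand-in and the sample numbers are hypothetical):

from dataclasses import dataclass
from typing import Any

@dataclass
class _Finish:
    """Hypothetical stand-in for FinishEvent, reduced to the fields used here."""
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0  # 0.0 means unreported, not free

def accumulate(events: list[_Finish]) -> dict[str, Any]:
    # Mirrors the mixed-type token_counts dict above: int counters plus a
    # float "cost" slot summed across turns.
    totals: dict[str, Any] = {"input": 0, "output": 0, "cost": 0.0}
    for ev in events:
        totals["input"] += ev.input_tokens
        totals["output"] += ev.output_tokens
        totals["cost"] = totals.get("cost", 0.0) + ev.cost_usd
    return totals

print(accumulate([_Finish(1200, 300, 0.0042), _Finish(800, 150, 0.0)]))
# {'input': 2000, 'output': 450, 'cost': 0.0042}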
+5
@@ -810,6 +810,7 @@ class EventBus:
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str | None = None,
iteration: int | None = None,
) -> None:
@@ -818,6 +819,9 @@ class EventBus:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` (already inside provider ``prompt_tokens``).
Subscribers should display them, not add them to a total.
``cost_usd`` is the USD cost for this turn when known (Anthropic,
OpenAI, OpenRouter). 0.0 means unreported (not free).
"""
data: dict = {
"stop_reason": stop_reason,
@@ -826,6 +830,7 @@ class EventBus:
"output_tokens": output_tokens,
"cached_tokens": cached_tokens,
"cache_creation_tokens": cache_creation_tokens,
"cost_usd": cost_usd,
}
if iteration is not None:
data["iteration"] = iteration
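A minimal subscriber-side sketch of how a consumer might render this payload (illustrative only; `format_turn_usage` is a hypothetical helper and the sample values are made up):

def format_turn_usage(data: dict) -> str:
    # cached/cache_creation are subsets of input_tokens: display them,
    # never add them to a total. cost_usd == 0.0 means unreported, not free.
    cost = data.get("cost_usd", 0.0)
    cost_str = f"${cost:.4f}" if cost > 0 else "unreported"
    return (
        f"in={data.get('input_tokens', 0)} "
        f"(cache read {data.get('cached_tokens', 0)}, "
        f"cache write {data.get('cache_creation_tokens', 0)}) "
        f"out={data.get('output_tokens', 0)} cost={cost_str}"
    )

print(format_turn_usage({
    "input_tokens": 100, "output_tokens": 50,
    "cached_tokens": 30, "cache_creation_tokens": 10,
    "cost_usd": 0.0042,
}))
# in=100 (cache read 30, cache write 10) out=50 cost=$0.0042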
+100
@@ -360,6 +360,82 @@ FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"
MAX_FAILED_REQUEST_DUMPS = 50
def _extract_cost(response: Any, model: str) -> float:
"""Pull the USD cost for a non-streaming completion response.
Sources checked, in priority order:
1. ``usage.cost`` populated when OpenRouter returns native cost via
``usage: {include: true}`` or when ``litellm.include_cost_in_streaming_usage``
is on.
2. ``response._hidden_params["response_cost"]`` set by LiteLLM's
logging layer after most successful completions.
3. ``litellm.completion_cost(...)`` computes from the model pricing
table; works across Anthropic, OpenAI, and OpenRouter as long as the
model is in LiteLLM's catalog.
Returns 0.0 for unpriced models or unexpected response shapes; cost is a
display concern, never let it break the hot path. For streaming paths
where the aggregate response isn't a full ``ModelResponse``, use
:func:`_cost_from_tokens` with the already-extracted token counts.
"""
if response is None:
return 0.0
usage = getattr(response, "usage", None)
usage_cost = getattr(usage, "cost", None) if usage is not None else None
if isinstance(usage_cost, (int, float)) and usage_cost > 0:
return float(usage_cost)
hidden = getattr(response, "_hidden_params", None)
if isinstance(hidden, dict):
hp_cost = hidden.get("response_cost")
if isinstance(hp_cost, (int, float)) and hp_cost > 0:
return float(hp_cost)
try:
import litellm as _litellm
computed = _litellm.completion_cost(completion_response=response, model=model)
if isinstance(computed, (int, float)) and computed > 0:
return float(computed)
except Exception as exc:
logger.debug("[cost] completion_cost failed for %s: %s", model, exc)
return 0.0
def _cost_from_tokens(
model: str,
input_tokens: int,
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
) -> float:
"""Compute USD cost from already-normalized token counts.
Used on streaming paths where the aggregate ``response`` is the stream
wrapper (not a full ``ModelResponse``) and ``litellm.completion_cost`` on
it either no-ops or raises. Calls ``litellm.cost_per_token`` directly
with the cache-aware inputs so Anthropic's 5-min-write / cache-read
multipliers are applied correctly.
"""
if not model or (input_tokens == 0 and output_tokens == 0):
return 0.0
try:
import litellm as _litellm
prompt_cost, completion_cost = _litellm.cost_per_token(
model=model,
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
cache_read_input_tokens=cached_tokens,
cache_creation_input_tokens=cache_creation_tokens,
)
total = (prompt_cost or 0.0) + (completion_cost or 0.0)
return float(total) if total > 0 else 0.0
except Exception as exc:
logger.debug("[cost] cost_per_token failed for %s: %s", model, exc)
return 0.0
def _extract_cache_tokens(usage: Any) -> tuple[int, int]:
"""Pull (cache_read, cache_creation) from a LiteLLM usage object.
@@ -1115,6 +1191,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
return LLMResponse(
content=content,
@@ -1123,6 +1200,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=response.choices[0].finish_reason or "",
raw_response=response,
)
@@ -1338,6 +1416,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
return LLMResponse(
content=content,
@@ -1346,6 +1425,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=response.choices[0].finish_reason or "",
raw_response=response,
)
@@ -1821,6 +1901,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
stop_reason = "tool_calls" if tool_calls else (response.choices[0].finish_reason or "stop")
return LLMResponse(
@@ -1830,6 +1911,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=stop_reason,
raw_response={
"compat_mode": "openrouter_tool_emulation",
@@ -1891,6 +1973,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
@@ -1960,6 +2043,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
@@ -2286,6 +2370,13 @@ class LiteLLMProvider(LLMProvider):
choice.finish_reason,
self.model,
)
cost_usd = _cost_from_tokens(
self.model,
input_tokens,
output_tokens,
cached_tokens,
cache_creation_tokens,
)
tail_events.append(
FinishEvent(
stop_reason=choice.finish_reason,
@@ -2293,6 +2384,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
model=self.model,
)
)
@@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider):
cache_creation_tokens,
self.model,
)
cost_usd = _cost_from_tokens(
self.model,
input_tokens,
output_tokens,
cached_tokens,
cache_creation_tokens,
)
# Patch the FinishEvent already queued with 0 tokens
for _i, _ev in enumerate(tail_events):
if isinstance(_ev, FinishEvent) and _ev.input_tokens == 0:
@@ -2344,6 +2443,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
model=_ev.model,
)
break
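A usage sketch for the two helpers added above (not part of the commit; it assumes the `framework.llm.litellm` import path used by the tests later in this diff and an Anthropic model that LiteLLM can price):

from framework.llm.litellm import _cost_from_tokens, _extract_cost

# Non-streaming path: the full ModelResponse is available, so _extract_cost
# can check usage.cost, then _hidden_params["response_cost"], then fall back
# to litellm.completion_cost().
# cost_usd = _extract_cost(response, "anthropic/claude-sonnet-4.5")

# Streaming path: only aggregated token counts exist, so compute from tokens
# with the cache-aware fields; unpriced models simply come back as 0.0.
cost_usd = _cost_from_tokens(
    "anthropic/claude-sonnet-4.5",
    input_tokens=1000,
    output_tokens=500,
    cached_tokens=200,
    cache_creation_tokens=100,
)
print(f"${cost_usd:.4f}" if cost_usd > 0 else "unreported")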
+6
@@ -15,6 +15,10 @@ class LLMResponse:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` (providers report them inside ``prompt_tokens``).
Surface them for visibility; do not add to a total.
``cost_usd`` is the per-call USD cost when the provider / pricing table
can produce one (Anthropic, OpenAI, OpenRouter are supported). 0.0 when
unknown or unpriced; treat as "unreported", not "free".
"""
content: str
@@ -23,6 +27,7 @@ class LLMResponse:
output_tokens: int = 0
cached_tokens: int = 0
cache_creation_tokens: int = 0
cost_usd: float = 0.0
stop_reason: str = ""
raw_response: Any = None
@@ -189,6 +194,7 @@ class LLMProvider(ABC):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
+4
@@ -70,6 +70,9 @@ class FinishEvent:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens``; providers count both inside ``prompt_tokens`` already.
Surface them separately for visibility; never add to a total.
``cost_usd`` is the per-turn USD cost when the provider or LiteLLM's
pricing table supplies one; 0.0 means unreported (not free).
"""
type: Literal["finish"] = "finish"
@@ -78,6 +81,7 @@ class FinishEvent:
output_tokens: int = 0
cached_tokens: int = 0
cache_creation_tokens: int = 0
cost_usd: float = 0.0
model: str = ""
+27 -9
@@ -155,7 +155,7 @@ interface ChatPanelProps {
* `cached` (cache reads) and `cacheCreated` (cache writes) are subsets of
* `input` — providers count both inside prompt_tokens. Display them
* separately; do not add to a total. */
tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number };
tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number; costUsd?: number };
/** Optional action element rendered on the right side of the "Conversation" header */
headerAction?: React.ReactNode;
}
@@ -1488,17 +1488,35 @@ export default function ChatPanel({
{hasTokens && (() => {
const cached = tokenUsage!.cached ?? 0;
const created = tokenUsage!.cacheCreated ?? 0;
const cost = tokenUsage!.costUsd ?? 0;
// cached/created are subsets of input — never sum; surface separately.
const title = [
"LLM tokens used this session",
`input ${fmt(tokenUsage!.input)}`,
` cache read ${fmt(cached)}`,
` cache write ${fmt(created)}`,
`output ${fmt(tokenUsage!.output)}`,
].join("\n");
// Cost can be < $0.01; show 4 decimals so small-model sessions aren't "$0.00".
const costStr = cost > 0 ? `$${cost.toFixed(4)}` : "—";
return (
<span title={title}>
<span className="group relative cursor-help transition-colors hover:text-muted-foreground">
Tokens: {fmt(tokenUsage!.output)}
<span
role="tooltip"
className="pointer-events-none invisible absolute bottom-full right-0 z-50 mb-2 whitespace-nowrap rounded-md border border-border bg-popover px-3 py-2 text-[11px] text-popover-foreground opacity-0 shadow-lg transition-[opacity,transform] duration-150 translate-y-1 group-hover:visible group-hover:opacity-100 group-hover:translate-y-0"
>
<span className="mb-1.5 block text-muted-foreground">
LLM tokens used this session
</span>
<span className="grid grid-cols-[auto_1fr] gap-x-4 gap-y-0.5 tabular-nums">
<span>Input</span>
<span className="text-right">{fmt(tokenUsage!.input)}</span>
<span className="pl-3 text-muted-foreground">cache read</span>
<span className="text-right text-muted-foreground">{fmt(cached)}</span>
<span className="pl-3 text-muted-foreground">cache write</span>
<span className="text-right text-muted-foreground">{fmt(created)}</span>
<span>Output</span>
<span className="text-right">{fmt(tokenUsage!.output)}</span>
<span className="mt-1 border-t border-border/50 pt-1">Cost</span>
<span className="mt-1 border-t border-border/50 pt-1 text-right font-medium">
{costStr}
</span>
</span>
</span>
</span>
);
})()}
+6 -1
@@ -73,11 +73,14 @@ export default function QueenDM() {
const [awaitingInput, setAwaitingInput] = useState(false);
// `cached` and `cacheCreated` are subsets of `input` (providers count both
// inside prompt_tokens already) — display them, never add them to a total.
// `costUsd` is the session-total USD cost when the provider supplies one
// (Anthropic, OpenAI, OpenRouter); 0 means unreported, not free.
const [tokenUsage, setTokenUsage] = useState({
input: 0,
output: 0,
cached: 0,
cacheCreated: 0,
costUsd: 0,
});
const [historySessions, setHistorySessions] = useState<HistorySession[]>([]);
const [historyLoading, setHistoryLoading] = useState(false);
@@ -125,7 +128,7 @@ export default function QueenDM() {
setPendingQuestions(null);
setAwaitingInput(false);
setQueenPhase("independent");
setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0 });
setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0, costUsd: 0 });
setInitialDraft(null);
setColonySpawned(false);
setSpawnedColonyName(null);
@@ -587,11 +590,13 @@ export default function QueenDM() {
// separately for display, do NOT roll into input/total.
const cached = (event.data.cached_tokens as number) || 0;
const cacheCreated = (event.data.cache_creation_tokens as number) || 0;
const costUsd = (event.data.cost_usd as number) || 0;
setTokenUsage((prev) => ({
input: prev.input + inp,
output: prev.output + out,
cached: prev.cached + cached,
cacheCreated: prev.cacheCreated + cacheCreated,
costUsd: prev.costUsd + costUsd,
}));
}
// Flush one queued message per LLM turn boundary. This is the
+2
@@ -814,6 +814,7 @@ class TestConveniencePublishers:
output_tokens=50,
cached_tokens=30,
cache_creation_tokens=10,
cost_usd=0.0042,
execution_id="exec_1",
iteration=3,
)
@@ -828,6 +829,7 @@ class TestConveniencePublishers:
# display, NOT additive to input_tokens.
assert received[0].data["cached_tokens"] == 30
assert received[0].data["cache_creation_tokens"] == 10
assert received[0].data["cost_usd"] == 0.0042
assert received[0].data["iteration"] == 3
@pytest.mark.asyncio
+123
@@ -25,8 +25,10 @@ from framework.llm.litellm import (
LiteLLMProvider,
_build_system_message,
_compute_retry_delay,
_cost_from_tokens,
_ensure_ollama_chat_prefix,
_extract_cache_tokens,
_extract_cost,
_is_ollama_model,
_model_supports_cache_control,
_summarize_request_for_log,
@@ -1512,3 +1514,124 @@ class TestStreamingChunksFallbackPreservesCacheFields:
assert cached == 5601
assert creation == 0
class TestExtractCost:
"""`_extract_cost` pulls USD cost from three sources in order:
usage.cost (OpenRouter native / include_cost_in_streaming_usage)
response._hidden_params['response_cost'] (LiteLLM logging)
litellm.completion_cost() (pricing-table fallback)."""
def test_none_response_returns_zero(self):
assert _extract_cost(None, "gpt-4o-mini") == 0.0
def test_openrouter_usage_cost_is_preferred(self):
"""OpenRouter returns authoritative per-call cost on usage.cost when
the caller opts in (usage.include=true). That beats LiteLLM's
pricing-table estimate because it reflects promo pricing and BYOK markup."""
response = MagicMock()
response.usage = MagicMock(cost=0.00123)
response._hidden_params = {"response_cost": 99.99} # should be ignored
assert _extract_cost(response, "openrouter/anthropic/claude-opus-4.5") == 0.00123
def test_hidden_params_response_cost_used_when_no_usage_cost(self):
"""LiteLLM's logging layer attaches response_cost after most
completions; this is how OpenAI/Anthropic responses get costed
without going back to the pricing table."""
response = MagicMock()
response.usage = MagicMock(spec=[]) # no .cost attribute
response._hidden_params = {"response_cost": 0.0042}
assert _extract_cost(response, "gpt-4o-mini") == 0.0042
def test_falls_back_to_completion_cost_when_nothing_pre_populated(self):
"""For providers where LiteLLM didn't pre-populate cost, call
litellm.completion_cost() against the pricing table. Mocked here
because we don't want tests depending on the exact price of
claude-sonnet-4.5 in LiteLLM's model map."""
response = MagicMock()
response.usage = MagicMock(spec=[])
response._hidden_params = {}
with patch("litellm.completion_cost", return_value=0.00789):
assert _extract_cost(response, "anthropic/claude-sonnet-4.5") == 0.00789
def test_completion_cost_exception_returns_zero(self):
"""Unpriced models (e.g. new OpenRouter routes not yet in LiteLLM's
catalog) must not crash the hot path."""
response = MagicMock()
response.usage = MagicMock(spec=[])
response._hidden_params = {}
with patch("litellm.completion_cost", side_effect=Exception("no pricing")):
assert _extract_cost(response, "openrouter/mystery/model") == 0.0
def test_zero_cost_falls_through_to_next_source(self):
"""usage.cost == 0 should NOT short-circuit; fall through to
_hidden_params / completion_cost so we don't cement a false zero."""
response = MagicMock()
response.usage = MagicMock(cost=0.0)
response._hidden_params = {"response_cost": 0.0055}
assert _extract_cost(response, "gpt-4o-mini") == 0.0055
class TestCostFromTokens:
"""`_cost_from_tokens` is the streaming-path cost helper: stream wrappers
don't expose the full ModelResponse shape that completion_cost() expects,
so we go through cost_per_token() with the already-extracted totals."""
def test_zero_tokens_returns_zero_without_calling_litellm(self):
with patch("litellm.cost_per_token") as mock:
assert _cost_from_tokens("claude-opus-4.5", 0, 0) == 0.0
mock.assert_not_called()
def test_empty_model_returns_zero(self):
assert _cost_from_tokens("", 1000, 500) == 0.0
def test_computes_from_tokens(self):
with patch("litellm.cost_per_token", return_value=(0.001, 0.002)) as mock:
cost = _cost_from_tokens(
"anthropic/claude-opus-4.5",
input_tokens=1000,
output_tokens=500,
cached_tokens=200,
cache_creation_tokens=100,
)
assert cost == pytest.approx(0.003)
# Verify the cache-aware kwargs are threaded through — Anthropic
# needs these to apply the 1.25x write / 0.1x read multipliers.
call_kwargs = mock.call_args.kwargs
assert call_kwargs["prompt_tokens"] == 1000
assert call_kwargs["completion_tokens"] == 500
assert call_kwargs["cache_read_input_tokens"] == 200
assert call_kwargs["cache_creation_input_tokens"] == 100
def test_exception_returns_zero(self):
with patch("litellm.cost_per_token", side_effect=Exception("unpriced")):
assert _cost_from_tokens("mystery/model", 1000, 500) == 0.0
def test_negative_or_none_components_coerce_to_zero(self):
"""LiteLLM returns (None, None) for unknown models in some versions;
treat as 0 rather than crashing on None+None."""
with patch("litellm.cost_per_token", return_value=(None, None)):
assert _cost_from_tokens("some/model", 1, 1) == 0.0
class TestLLMResponseAndFinishEventHaveCostUsd:
"""Regression: both LLMResponse and FinishEvent must carry cost_usd so
the agent loop -> event bus -> frontend pipeline doesn't lose cost."""
def test_llm_response_defaults_cost_to_zero(self):
from framework.llm.provider import LLMResponse
r = LLMResponse(content="", model="m")
assert r.cost_usd == 0.0
def test_finish_event_defaults_cost_to_zero(self):
from framework.llm.stream_events import FinishEvent
e = FinishEvent()
assert e.cost_usd == 0.0
def test_finish_event_accepts_cost(self):
from framework.llm.stream_events import FinishEvent
e = FinishEvent(cost_usd=0.0123)
assert e.cost_usd == 0.0123
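For intuition on the cache multipliers mentioned in `TestCostFromTokens`, here is a worked pricing example (illustrative only; the $3/MTok input and $15/MTok output rates are assumptions for the sketch, and the breakdown assumes the cached counts are carved out of `input_tokens` as the docstrings above describe):

input_price = 3.00 / 1_000_000    # assumed USD per uncached input token
output_price = 15.00 / 1_000_000  # assumed USD per output token

input_tokens = 1000           # total prompt tokens (cached subsets included)
cached_tokens = 200           # cache reads, billed at 0.1x the input rate
cache_creation_tokens = 100   # 5-min cache writes, billed at 1.25x the input rate
output_tokens = 500

uncached = input_tokens - cached_tokens - cache_creation_tokens
cost = (
    uncached * input_price
    + cached_tokens * input_price * 0.1
    + cache_creation_tokens * input_price * 1.25
    + output_tokens * output_price
)
print(f"${cost:.6f}")  # $0.010035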
+1
@@ -242,6 +242,7 @@ class TestEventSerialization:
"output_tokens": 20,
"cached_tokens": 0,
"cache_creation_tokens": 0,
"cost_usd": 0.0,
"model": "gpt-4",
}