feat: cost tracking
@@ -940,6 +940,7 @@ class AgentLoop(AgentProtocol):
output_tokens=turn_tokens.get("output", 0),
cached_tokens=turn_tokens.get("cached", 0),
cache_creation_tokens=turn_tokens.get("cache_creation", 0),
cost_usd=float(turn_tokens.get("cost", 0.0) or 0.0),
execution_id=execution_id,
iteration=iteration,
)
@@ -2340,7 +2341,9 @@ class AgentLoop(AgentProtocol):
stream_id = ctx.stream_id or ctx.agent_id
node_id = ctx.agent_id
execution_id = ctx.execution_id or ""
token_counts: dict[str, int] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0}
# Mixed-type dict: int token counts + str stop_reason/model + float cost.
# Typed loosely to avoid churn in the many call sites that read from it.
token_counts: dict[str, Any] = {"input": 0, "output": 0, "cached": 0, "cache_creation": 0, "cost": 0.0}
tool_call_count = 0
final_text = ""
final_system_prompt = conversation.system_prompt
@@ -2572,6 +2575,7 @@ class AgentLoop(AgentProtocol):
token_counts["output"] += event.output_tokens
token_counts["cached"] += event.cached_tokens
token_counts["cache_creation"] += event.cache_creation_tokens
token_counts["cost"] = token_counts.get("cost", 0.0) + event.cost_usd
token_counts["stop_reason"] = event.stop_reason
token_counts["model"] = event.model
@@ -4154,6 +4158,7 @@ class AgentLoop(AgentProtocol):
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str = "",
iteration: int | None = None,
) -> None:
@@ -4167,6 +4172,7 @@ class AgentLoop(AgentProtocol):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
execution_id=execution_id,
iteration=iteration,
)
@@ -109,6 +109,7 @@ async def publish_llm_turn_complete(
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str = "",
iteration: int | None = None,
) -> None:
@@ -122,6 +123,7 @@ async def publish_llm_turn_complete(
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
execution_id=execution_id,
iteration=iteration,
)
@@ -810,6 +810,7 @@ class EventBus:
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
cost_usd: float = 0.0,
execution_id: str | None = None,
iteration: int | None = None,
) -> None:
@@ -818,6 +819,9 @@ class EventBus:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` (already inside provider ``prompt_tokens``).
Subscribers should display them, not add them to a total.

``cost_usd`` is the USD cost for this turn when known (Anthropic,
OpenAI, OpenRouter). 0.0 means unreported (not free).
"""
data: dict = {
"stop_reason": stop_reason,
@@ -826,6 +830,7 @@ class EventBus:
"output_tokens": output_tokens,
"cached_tokens": cached_tokens,
"cache_creation_tokens": cache_creation_tokens,
"cost_usd": cost_usd,
}
if iteration is not None:
data["iteration"] = iteration
@@ -360,6 +360,82 @@ FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"
MAX_FAILED_REQUEST_DUMPS = 50


def _extract_cost(response: Any, model: str) -> float:
"""Pull the USD cost for a non-streaming completion response.

Sources checked, in priority order:
1. ``usage.cost`` — populated when OpenRouter returns native cost via
``usage: {include: true}`` or when ``litellm.include_cost_in_streaming_usage``
is on.
2. ``response._hidden_params["response_cost"]`` — set by LiteLLM's
logging layer after most successful completions.
3. ``litellm.completion_cost(...)`` — computes from the model pricing
table; works across Anthropic, OpenAI, and OpenRouter as long as the
model is in LiteLLM's catalog.

Returns 0.0 for unpriced models or unexpected response shapes — cost is a
display concern, never let it break the hot path. For streaming paths
where the aggregate response isn't a full ``ModelResponse``, use
:func:`_cost_from_tokens` with the already-extracted token counts.
"""
if response is None:
return 0.0
usage = getattr(response, "usage", None)
usage_cost = getattr(usage, "cost", None) if usage is not None else None
if isinstance(usage_cost, (int, float)) and usage_cost > 0:
return float(usage_cost)

hidden = getattr(response, "_hidden_params", None)
if isinstance(hidden, dict):
hp_cost = hidden.get("response_cost")
if isinstance(hp_cost, (int, float)) and hp_cost > 0:
return float(hp_cost)

try:
import litellm as _litellm

computed = _litellm.completion_cost(completion_response=response, model=model)
if isinstance(computed, (int, float)) and computed > 0:
return float(computed)
except Exception as exc:
logger.debug("[cost] completion_cost failed for %s: %s", model, exc)
return 0.0
def _cost_from_tokens(
model: str,
input_tokens: int,
output_tokens: int,
cached_tokens: int = 0,
cache_creation_tokens: int = 0,
) -> float:
"""Compute USD cost from already-normalized token counts.

Used on streaming paths where the aggregate ``response`` is the stream
wrapper (not a full ``ModelResponse``) and ``litellm.completion_cost`` on
it either no-ops or raises. Calls ``litellm.cost_per_token`` directly
with the cache-aware inputs so Anthropic's 5-min-write / cache-read
multipliers are applied correctly.
"""
if not model or (input_tokens == 0 and output_tokens == 0):
return 0.0
try:
import litellm as _litellm

prompt_cost, completion_cost = _litellm.cost_per_token(
model=model,
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
cache_read_input_tokens=cached_tokens,
cache_creation_input_tokens=cache_creation_tokens,
)
total = (prompt_cost or 0.0) + (completion_cost or 0.0)
return float(total) if total > 0 else 0.0
except Exception as exc:
logger.debug("[cost] cost_per_token failed for %s: %s", model, exc)
return 0.0
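
Why the cache-aware kwargs matter: a worked sketch with assumed per-token prices (the $3/M input and $15/M output figures are illustrative, not a real price sheet) showing how the roughly 1.25x cache-write and 0.1x cache-read multipliers shift the total:

    # Illustrative Anthropic-style pricing; real rates come from LiteLLM's model map.
    BASE_IN = 3.0 / 1_000_000    # assumed $ per uncached input token
    BASE_OUT = 15.0 / 1_000_000  # assumed $ per output token

    def sketch_cost(input_tokens, output_tokens, cached_tokens, cache_creation_tokens):
        # cached / cache_creation are subsets of input_tokens: price the uncached
        # remainder at the base rate and each cached subset at its multiplier.
        uncached = input_tokens - cached_tokens - cache_creation_tokens
        return (
            uncached * BASE_IN
            + cached_tokens * BASE_IN * 0.1            # cache read: ~0.1x base
            + cache_creation_tokens * BASE_IN * 1.25   # 5-min cache write: ~1.25x base
            + output_tokens * BASE_OUT
        )

    # 10k input (8k of it cache reads, 1k cache writes) plus 2k output:
    # 1000*3e-6 + 8000*0.3e-6 + 1000*3.75e-6 + 2000*15e-6 ≈ $0.0392
    print(f"${sketch_cost(10_000, 2_000, 8_000, 1_000):.4f}")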
def _extract_cache_tokens(usage: Any) -> tuple[int, int]:
"""Pull (cache_read, cache_creation) from a LiteLLM usage object.

@@ -1115,6 +1191,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)

return LLMResponse(
content=content,
@@ -1123,6 +1200,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=response.choices[0].finish_reason or "",
raw_response=response,
)
@@ -1338,6 +1416,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)

return LLMResponse(
content=content,
@@ -1346,6 +1425,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=response.choices[0].finish_reason or "",
raw_response=response,
)
@@ -1821,6 +1901,7 @@ class LiteLLMProvider(LLMProvider):
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
cached_tokens, cache_creation_tokens = _extract_cache_tokens(usage)
cost_usd = _extract_cost(response, self.model)
stop_reason = "tool_calls" if tool_calls else (response.choices[0].finish_reason or "stop")

return LLMResponse(
@@ -1830,6 +1911,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
stop_reason=stop_reason,
raw_response={
"compat_mode": "openrouter_tool_emulation",
@@ -1891,6 +1973,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)
@@ -1960,6 +2043,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)

@@ -2286,6 +2370,13 @@ class LiteLLMProvider(LLMProvider):
choice.finish_reason,
self.model,
)
cost_usd = _cost_from_tokens(
self.model,
input_tokens,
output_tokens,
cached_tokens,
cache_creation_tokens,
)
tail_events.append(
FinishEvent(
stop_reason=choice.finish_reason,
@@ -2293,6 +2384,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
model=self.model,
)
)
@@ -2335,6 +2427,13 @@ class LiteLLMProvider(LLMProvider):
cache_creation_tokens,
self.model,
)
cost_usd = _cost_from_tokens(
self.model,
input_tokens,
output_tokens,
cached_tokens,
cache_creation_tokens,
)
# Patch the FinishEvent already queued with 0 tokens
for _i, _ev in enumerate(tail_events):
if isinstance(_ev, FinishEvent) and _ev.input_tokens == 0:
@@ -2344,6 +2443,7 @@ class LiteLLMProvider(LLMProvider):
output_tokens=output_tokens,
cached_tokens=cached_tokens,
cache_creation_tokens=cache_creation_tokens,
cost_usd=cost_usd,
model=_ev.model,
)
break
@@ -15,6 +15,10 @@ class LLMResponse:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` (providers report them inside ``prompt_tokens``).
Surface them for visibility; do not add to a total.

``cost_usd`` is the per-call USD cost when the provider / pricing table
can produce one (Anthropic, OpenAI, OpenRouter are supported). 0.0 when
unknown or unpriced — treat as "unreported", not "free".
"""

content: str
@@ -23,6 +27,7 @@ class LLMResponse:
output_tokens: int = 0
cached_tokens: int = 0
cache_creation_tokens: int = 0
cost_usd: float = 0.0
stop_reason: str = ""
raw_response: Any = None

@@ -189,6 +194,7 @@ class LLMProvider(ABC):
output_tokens=response.output_tokens,
cached_tokens=response.cached_tokens,
cache_creation_tokens=response.cache_creation_tokens,
cost_usd=response.cost_usd,
model=response.model,
)

@@ -70,6 +70,9 @@ class FinishEvent:
``cached_tokens`` and ``cache_creation_tokens`` are subsets of
``input_tokens`` — providers count both inside ``prompt_tokens`` already.
Surface them separately for visibility; never add to a total.

``cost_usd`` is the per-turn USD cost when the provider or LiteLLM's
pricing table supplies one; 0.0 means unreported (not free).
"""

type: Literal["finish"] = "finish"
@@ -78,6 +81,7 @@ class FinishEvent:
output_tokens: int = 0
cached_tokens: int = 0
cache_creation_tokens: int = 0
cost_usd: float = 0.0
model: str = ""
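
A minimal consumer sketch of the enriched stream, assuming an async provider stream() that yields these events (the call shape and function name are illustrative; the agent-loop hunks above do the same accumulation into token_counts):

    from framework.llm.stream_events import FinishEvent  # import path as used in the tests below

    async def report_stream(provider, conversation) -> float:
        session_cost = 0.0
        async for event in provider.stream(conversation):  # illustrative call shape
            if isinstance(event, FinishEvent):
                # cached/cache_creation already live inside input_tokens; report, don't sum.
                session_cost += event.cost_usd
                cost_str = f"${event.cost_usd:.4f}" if event.cost_usd > 0 else "unreported"
                print(
                    f"{event.model}: in={event.input_tokens} out={event.output_tokens} "
                    f"(cache r/w {event.cached_tokens}/{event.cache_creation_tokens}) "
                    f"cost={cost_str}"
                )
        return session_cost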
@@ -155,7 +155,7 @@ interface ChatPanelProps {
* `cached` (cache reads) and `cacheCreated` (cache writes) are subsets of
* `input` — providers count both inside prompt_tokens. Display them
* separately; do not add to a total. */
tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number };
tokenUsage?: { input: number; output: number; cached?: number; cacheCreated?: number; costUsd?: number };
/** Optional action element rendered on the right side of the "Conversation" header */
headerAction?: React.ReactNode;
}
@@ -1488,17 +1488,35 @@ export default function ChatPanel({
{hasTokens && (() => {
const cached = tokenUsage!.cached ?? 0;
const created = tokenUsage!.cacheCreated ?? 0;
const cost = tokenUsage!.costUsd ?? 0;
// cached/created are subsets of input — never sum; surface separately.
const title = [
"LLM tokens used this session",
`input ${fmt(tokenUsage!.input)}`,
` cache read ${fmt(cached)}`,
` cache write ${fmt(created)}`,
`output ${fmt(tokenUsage!.output)}`,
].join("\n");
// Cost can be < $0.01; show 4 decimals so small-model sessions aren't "$0.00".
const costStr = cost > 0 ? `$${cost.toFixed(4)}` : "—";
return (
<span title={title}>
<span className="group relative cursor-help transition-colors hover:text-muted-foreground">
Tokens: {fmt(tokenUsage!.output)}
<span
role="tooltip"
className="pointer-events-none invisible absolute bottom-full right-0 z-50 mb-2 whitespace-nowrap rounded-md border border-border bg-popover px-3 py-2 text-[11px] text-popover-foreground opacity-0 shadow-lg transition-[opacity,transform] duration-150 translate-y-1 group-hover:visible group-hover:opacity-100 group-hover:translate-y-0"
>
<span className="mb-1.5 block text-muted-foreground">
LLM tokens used this session
</span>
<span className="grid grid-cols-[auto_1fr] gap-x-4 gap-y-0.5 tabular-nums">
<span>Input</span>
<span className="text-right">{fmt(tokenUsage!.input)}</span>
<span className="pl-3 text-muted-foreground">cache read</span>
<span className="text-right text-muted-foreground">{fmt(cached)}</span>
<span className="pl-3 text-muted-foreground">cache write</span>
<span className="text-right text-muted-foreground">{fmt(created)}</span>
<span>Output</span>
<span className="text-right">{fmt(tokenUsage!.output)}</span>
<span className="mt-1 border-t border-border/50 pt-1">Cost</span>
<span className="mt-1 border-t border-border/50 pt-1 text-right font-medium">
{costStr}
</span>
</span>
</span>
</span>
);
})()}
@@ -73,11 +73,14 @@ export default function QueenDM() {
const [awaitingInput, setAwaitingInput] = useState(false);
// `cached` and `cacheCreated` are subsets of `input` (providers count both
// inside prompt_tokens already) — display them, never add them to a total.
// `costUsd` is the session-total USD cost when the provider supplies one
// (Anthropic, OpenAI, OpenRouter); 0 means unreported, not free.
const [tokenUsage, setTokenUsage] = useState({
input: 0,
output: 0,
cached: 0,
cacheCreated: 0,
costUsd: 0,
});
const [historySessions, setHistorySessions] = useState<HistorySession[]>([]);
const [historyLoading, setHistoryLoading] = useState(false);
@@ -125,7 +128,7 @@ export default function QueenDM() {
setPendingQuestions(null);
setAwaitingInput(false);
setQueenPhase("independent");
setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0 });
setTokenUsage({ input: 0, output: 0, cached: 0, cacheCreated: 0, costUsd: 0 });
setInitialDraft(null);
setColonySpawned(false);
setSpawnedColonyName(null);
@@ -587,11 +590,13 @@ export default function QueenDM() {
// separately for display, do NOT roll into input/total.
const cached = (event.data.cached_tokens as number) || 0;
const cacheCreated = (event.data.cache_creation_tokens as number) || 0;
const costUsd = (event.data.cost_usd as number) || 0;
setTokenUsage((prev) => ({
input: prev.input + inp,
output: prev.output + out,
cached: prev.cached + cached,
cacheCreated: prev.cacheCreated + cacheCreated,
costUsd: prev.costUsd + costUsd,
}));
}
// Flush one queued message per LLM turn boundary. This is the
@@ -814,6 +814,7 @@ class TestConveniencePublishers:
output_tokens=50,
cached_tokens=30,
cache_creation_tokens=10,
cost_usd=0.0042,
execution_id="exec_1",
iteration=3,
)
@@ -828,6 +829,7 @@ class TestConveniencePublishers:
# display, NOT additive to input_tokens.
assert received[0].data["cached_tokens"] == 30
assert received[0].data["cache_creation_tokens"] == 10
assert received[0].data["cost_usd"] == 0.0042
assert received[0].data["iteration"] == 3

@pytest.mark.asyncio

@@ -25,8 +25,10 @@ from framework.llm.litellm import (
LiteLLMProvider,
_build_system_message,
_compute_retry_delay,
_cost_from_tokens,
_ensure_ollama_chat_prefix,
_extract_cache_tokens,
_extract_cost,
_is_ollama_model,
_model_supports_cache_control,
_summarize_request_for_log,
@@ -1512,3 +1514,124 @@ class TestStreamingChunksFallbackPreservesCacheFields:

assert cached == 5601
assert creation == 0
class TestExtractCost:
"""`_extract_cost` pulls USD cost from three sources in order:
usage.cost (OpenRouter native / include_cost_in_streaming_usage) →
response._hidden_params['response_cost'] (LiteLLM logging) →
litellm.completion_cost() (pricing-table fallback)."""

def test_none_response_returns_zero(self):
assert _extract_cost(None, "gpt-4o-mini") == 0.0

def test_openrouter_usage_cost_is_preferred(self):
"""OpenRouter returns authoritative per-call cost on usage.cost when
the caller opts in (usage.include=true). That beats LiteLLM's
pricing-table estimate because it reflects promo pricing and BYOK markup."""
response = MagicMock()
response.usage = MagicMock(cost=0.00123)
response._hidden_params = {"response_cost": 99.99}  # should be ignored
assert _extract_cost(response, "openrouter/anthropic/claude-opus-4.5") == 0.00123

def test_hidden_params_response_cost_used_when_no_usage_cost(self):
"""LiteLLM's logging layer attaches response_cost after most
completions — this is how OpenAI/Anthropic responses get costed
without going back to the pricing table."""
response = MagicMock()
response.usage = MagicMock(spec=[])  # no .cost attribute
response._hidden_params = {"response_cost": 0.0042}
assert _extract_cost(response, "gpt-4o-mini") == 0.0042

def test_falls_back_to_completion_cost_when_nothing_pre_populated(self):
"""For providers where LiteLLM didn't pre-populate cost, call
litellm.completion_cost() against the pricing table. Mocked here
because we don't want tests depending on the exact price of
claude-sonnet-4.5 in LiteLLM's model map."""
response = MagicMock()
response.usage = MagicMock(spec=[])
response._hidden_params = {}
with patch("litellm.completion_cost", return_value=0.00789):
assert _extract_cost(response, "anthropic/claude-sonnet-4.5") == 0.00789

def test_completion_cost_exception_returns_zero(self):
"""Unpriced models (e.g. new OpenRouter routes not yet in LiteLLM's
catalog) must not crash the hot path."""
response = MagicMock()
response.usage = MagicMock(spec=[])
response._hidden_params = {}
with patch("litellm.completion_cost", side_effect=Exception("no pricing")):
assert _extract_cost(response, "openrouter/mystery/model") == 0.0

def test_zero_cost_falls_through_to_next_source(self):
"""usage.cost == 0 should NOT short-circuit; fall through to
_hidden_params / completion_cost so we don't cement a false zero."""
response = MagicMock()
response.usage = MagicMock(cost=0.0)
response._hidden_params = {"response_cost": 0.0055}
assert _extract_cost(response, "gpt-4o-mini") == 0.0055
class TestCostFromTokens:
"""`_cost_from_tokens` is the streaming-path cost helper: stream wrappers
don't expose the full ModelResponse shape that completion_cost() expects,
so we go through cost_per_token() with the already-extracted totals."""

def test_zero_tokens_returns_zero_without_calling_litellm(self):
with patch("litellm.cost_per_token") as mock:
assert _cost_from_tokens("claude-opus-4.5", 0, 0) == 0.0
mock.assert_not_called()

def test_empty_model_returns_zero(self):
assert _cost_from_tokens("", 1000, 500) == 0.0

def test_computes_from_tokens(self):
with patch("litellm.cost_per_token", return_value=(0.001, 0.002)) as mock:
cost = _cost_from_tokens(
"anthropic/claude-opus-4.5",
input_tokens=1000,
output_tokens=500,
cached_tokens=200,
cache_creation_tokens=100,
)
assert cost == pytest.approx(0.003)
# Verify the cache-aware kwargs are threaded through — Anthropic
# needs these to apply the 1.25x write / 0.1x read multipliers.
call_kwargs = mock.call_args.kwargs
assert call_kwargs["prompt_tokens"] == 1000
assert call_kwargs["completion_tokens"] == 500
assert call_kwargs["cache_read_input_tokens"] == 200
assert call_kwargs["cache_creation_input_tokens"] == 100

def test_exception_returns_zero(self):
with patch("litellm.cost_per_token", side_effect=Exception("unpriced")):
assert _cost_from_tokens("mystery/model", 1000, 500) == 0.0

def test_negative_or_none_components_coerce_to_zero(self):
"""LiteLLM returns (None, None) for unknown models in some versions;
treat as 0 rather than crashing on None+None."""
with patch("litellm.cost_per_token", return_value=(None, None)):
assert _cost_from_tokens("some/model", 1, 1) == 0.0
class TestLLMResponseAndFinishEventHaveCostUsd:
"""Regression: both LLMResponse and FinishEvent must carry cost_usd so
the agent loop → event bus → frontend pipeline doesn't lose cost."""

def test_llm_response_defaults_cost_to_zero(self):
from framework.llm.provider import LLMResponse

r = LLMResponse(content="", model="m")
assert r.cost_usd == 0.0

def test_finish_event_defaults_cost_to_zero(self):
from framework.llm.stream_events import FinishEvent

e = FinishEvent()
assert e.cost_usd == 0.0

def test_finish_event_accepts_cost(self):
from framework.llm.stream_events import FinishEvent

e = FinishEvent(cost_usd=0.0123)
assert e.cost_usd == 0.0123

@@ -242,6 +242,7 @@ class TestEventSerialization:
"output_tokens": 20,
"cached_tokens": 0,
"cache_creation_tokens": 0,
"cost_usd": 0.0,
"model": "gpt-4",
}