mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(usage): read top-level Anthropic cache fields from OAI-compatible proxies
Port from cline/cline#10266.

When OpenAI-compatible proxies (OpenRouter, Vercel AI Gateway, Cline) route Claude models, they sometimes surface the Anthropic-native cache counters (`cache_read_input_tokens`, `cache_creation_input_tokens`) at the top level of the `usage` object instead of nesting them inside `prompt_tokens_details`.

Our chat-completions branch of `normalize_usage()` only read the nested `prompt_tokens_details` fields, so those responses:

- reported `cache_write_tokens = 0` even when the model actually did a prompt-cache write,
- reported only some of the cache-read tokens when the proxy exposed them top-level only,
- overstated `input_tokens` by the missed cache-write amount,

which in turn made cost estimation and the status-bar cache-hit percentage wrong for Claude traffic going through these gateways.

Now the chat-completions branch tries the OpenAI-standard `prompt_tokens_details` first and falls back to the top-level Anthropic-shape fields only if the nested values are absent/zero. The Anthropic and Codex Responses branches are unchanged.

Regression guards added for three shapes: top-level write + nested read, top-level-only, and both-present (nested wins).
This commit is contained in:
parent
402d048eb6
commit
a369987443
2 changed files with 79 additions and 0 deletions
|
|
@ -533,10 +533,22 @@ def normalize_usage(
|
||||||
prompt_total = _to_int(getattr(response_usage, "prompt_tokens", 0))
|
prompt_total = _to_int(getattr(response_usage, "prompt_tokens", 0))
|
||||||
output_tokens = _to_int(getattr(response_usage, "completion_tokens", 0))
|
output_tokens = _to_int(getattr(response_usage, "completion_tokens", 0))
|
||||||
details = getattr(response_usage, "prompt_tokens_details", None)
|
details = getattr(response_usage, "prompt_tokens_details", None)
|
||||||
|
# Primary: OpenAI-style prompt_tokens_details. Fallback: Anthropic-style
|
||||||
|
# top-level fields that some OpenAI-compatible proxies (OpenRouter, Vercel
|
||||||
|
# AI Gateway, Cline) expose when routing Claude models — without this
|
||||||
|
# fallback, cache writes are undercounted as 0 and cache reads can be
|
||||||
|
# missed when the proxy only surfaces them at the top level.
|
||||||
|
# Port of cline/cline#10266.
|
||||||
cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
|
cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
|
||||||
|
if not cache_read_tokens:
|
||||||
|
cache_read_tokens = _to_int(getattr(response_usage, "cache_read_input_tokens", 0))
|
||||||
cache_write_tokens = _to_int(
|
cache_write_tokens = _to_int(
|
||||||
getattr(details, "cache_write_tokens", 0) if details else 0
|
getattr(details, "cache_write_tokens", 0) if details else 0
|
||||||
)
|
)
|
||||||
|
if not cache_write_tokens:
|
||||||
|
cache_write_tokens = _to_int(
|
||||||
|
getattr(response_usage, "cache_creation_input_tokens", 0)
|
||||||
|
)
|
||||||
input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens)
|
input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens)
|
||||||
|
|
||||||
reasoning_tokens = 0
|
reasoning_tokens = 0
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,73 @@ def test_normalize_usage_openai_subtracts_cached_prompt_tokens():
|
||||||
assert normalized.output_tokens == 700
|
assert normalized.output_tokens == 700
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_usage_openai_reads_top_level_anthropic_cache_fields():
|
||||||
|
"""Some OpenAI-compatible proxies (OpenRouter, Vercel AI Gateway, Cline) expose
|
||||||
|
Anthropic-style cache token counts at the top level of the usage object when
|
||||||
|
routing Claude models, instead of nesting them in prompt_tokens_details.
|
||||||
|
|
||||||
|
Regression guard for the bug fixed in cline/cline#10266 — before this fix,
|
||||||
|
the chat-completions branch of normalize_usage() only read
|
||||||
|
prompt_tokens_details.cache_write_tokens and completely missed the
|
||||||
|
cache_creation_input_tokens case, so cache writes showed as 0 and reflected
|
||||||
|
inputTokens were overstated by the cache-write amount.
|
||||||
|
"""
|
||||||
|
usage = SimpleNamespace(
|
||||||
|
prompt_tokens=1000,
|
||||||
|
completion_tokens=200,
|
||||||
|
prompt_tokens_details=SimpleNamespace(cached_tokens=500),
|
||||||
|
cache_creation_input_tokens=300,
|
||||||
|
)
|
||||||
|
|
||||||
|
normalized = normalize_usage(usage, provider="openrouter", api_mode="chat_completions")
|
||||||
|
|
||||||
|
# Expected: cache read from prompt_tokens_details.cached_tokens (preferred),
|
||||||
|
# cache write from top-level cache_creation_input_tokens (fallback).
|
||||||
|
assert normalized.cache_read_tokens == 500
|
||||||
|
assert normalized.cache_write_tokens == 300
|
||||||
|
# input_tokens = prompt_total - cache_read - cache_write = 1000 - 500 - 300 = 200
|
||||||
|
assert normalized.input_tokens == 200
|
||||||
|
assert normalized.output_tokens == 200
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_usage_openai_reads_top_level_cache_read_when_details_missing():
|
||||||
|
"""Some proxies expose only top-level Anthropic-style fields with no
|
||||||
|
prompt_tokens_details object. Regression guard for cline/cline#10266.
|
||||||
|
"""
|
||||||
|
usage = SimpleNamespace(
|
||||||
|
prompt_tokens=1000,
|
||||||
|
completion_tokens=200,
|
||||||
|
cache_read_input_tokens=500,
|
||||||
|
cache_creation_input_tokens=300,
|
||||||
|
)
|
||||||
|
|
||||||
|
normalized = normalize_usage(usage, provider="openrouter", api_mode="chat_completions")
|
||||||
|
|
||||||
|
assert normalized.cache_read_tokens == 500
|
||||||
|
assert normalized.cache_write_tokens == 300
|
||||||
|
assert normalized.input_tokens == 200
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_usage_openai_prefers_prompt_tokens_details_over_top_level():
|
||||||
|
"""When both prompt_tokens_details and top-level Anthropic fields are
|
||||||
|
present, we prefer the OpenAI-standard nested fields. Top-level Anthropic
|
||||||
|
fields are only a fallback when the nested ones are absent/zero.
|
||||||
|
"""
|
||||||
|
usage = SimpleNamespace(
|
||||||
|
prompt_tokens=1000,
|
||||||
|
completion_tokens=200,
|
||||||
|
prompt_tokens_details=SimpleNamespace(cached_tokens=600, cache_write_tokens=150),
|
||||||
|
# Intentionally different values — proving we ignore these when details exist.
|
||||||
|
cache_read_input_tokens=999,
|
||||||
|
cache_creation_input_tokens=999,
|
||||||
|
)
|
||||||
|
|
||||||
|
normalized = normalize_usage(usage, provider="openrouter", api_mode="chat_completions")
|
||||||
|
|
||||||
|
assert normalized.cache_read_tokens == 600
|
||||||
|
assert normalized.cache_write_tokens == 150
|
||||||
|
|
||||||
|
|
||||||
def test_openrouter_models_api_pricing_is_converted_from_per_token_to_per_million(monkeypatch):
|
def test_openrouter_models_api_pricing_is_converted_from_per_token_to_per_million(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"agent.usage_pricing.fetch_model_metadata",
|
"agent.usage_pricing.fetch_model_metadata",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue