mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(cache): surface cache-hit telemetry for all providers, not just Anthropic-wire (#13543)
The 💾 Cache footer was gated on `self._use_prompt_caching`, which is
only True for Anthropic marker injection (native Anthropic, OpenRouter
Claude, Anthropic-wire gateways, Qwen on OpenCode/Alibaba). Providers
with automatic server-side prefix caching — OpenAI, Kimi, DeepSeek,
Qwen on OpenRouter — return `prompt_tokens_details.cached_tokens` too,
but users couldn't see their cache % because the display path never
fired for them. Result: people couldn't tell their cache was working or
broken without grepping agent.log.
`canonical_usage` from `normalize_usage()` already unifies all three
API shapes (Anthropic / Codex Responses / OpenAI chat completions) into
`cache_read_tokens` and `cache_write_tokens`. Drop the gate and read
from there — now the footer fires whenever the provider reported any
cached or written tokens, regardless of whether hermes injected markers.
Also removes duplicated branch-per-API-shape extraction code.
This commit is contained in:
parent
5e0eed470f
commit
432772dbdf
1 changed file with 20 additions and 15 deletions
35
run_agent.py
35
run_agent.py
|
|
@ -9907,22 +9907,27 @@ class AIAgent:
|
||||||
if self.verbose_logging:
|
if self.verbose_logging:
|
||||||
logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
|
logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
|
||||||
|
|
||||||
# Log cache hit stats when prompt caching is active
|
# Surface cache hit stats for any provider that reports
|
||||||
if self._use_prompt_caching:
|
# them — not just those where we inject cache_control
|
||||||
if self.api_mode == "anthropic_messages":
|
# markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic
|
||||||
_tcs = self._get_anthropic_transport()
|
# server-side prefix caching and return
|
||||||
_cache = _tcs.extract_cache_stats(response)
|
# ``prompt_tokens_details.cached_tokens``; users
|
||||||
cached = _cache["cached_tokens"] if _cache else 0
|
# previously could not see their cache % because this
|
||||||
written = _cache["creation_tokens"] if _cache else 0
|
# line was gated on ``_use_prompt_caching``, which is
|
||||||
else:
|
# only True for Anthropic-style marker injection.
|
||||||
# OpenRouter uses prompt_tokens_details.cached_tokens
|
# ``canonical_usage`` is already normalised from all
|
||||||
details = getattr(response.usage, 'prompt_tokens_details', None)
|
# three API shapes (Anthropic / Codex / OpenAI-chat)
|
||||||
cached = getattr(details, 'cached_tokens', 0) or 0 if details else 0
|
# so we can rely on its values directly.
|
||||||
written = getattr(details, 'cache_write_tokens', 0) or 0 if details else 0
|
cached = canonical_usage.cache_read_tokens
|
||||||
prompt = usage_dict["prompt_tokens"]
|
written = canonical_usage.cache_write_tokens
|
||||||
|
prompt = usage_dict["prompt_tokens"]
|
||||||
|
if (cached or written) and not self.quiet_mode:
|
||||||
hit_pct = (cached / prompt * 100) if prompt > 0 else 0
|
hit_pct = (cached / prompt * 100) if prompt > 0 else 0
|
||||||
if not self.quiet_mode:
|
self._vprint(
|
||||||
self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
|
f"{self.log_prefix} 💾 Cache: "
|
||||||
|
f"{cached:,}/{prompt:,} tokens "
|
||||||
|
f"({hit_pct:.0f}% hit, {written:,} written)"
|
||||||
|
)
|
||||||
|
|
||||||
has_retried_429 = False # Reset on success
|
has_retried_429 = False # Reset on success
|
||||||
# Clear Nous rate limit state on successful request —
|
# Clear Nous rate limit state on successful request —
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue