From 432772dbdf63eae379b76b3811c51284c36bb817 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 21 Apr 2026 06:42:32 -0700 Subject: [PATCH] fix(cache): surface cache-hit telemetry for all providers, not just Anthropic-wire (#13543) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 💾 Cache footer was gated on `self._use_prompt_caching`, which is only True for Anthropic marker injection (native Anthropic, OpenRouter Claude, Anthropic-wire gateways, Qwen on OpenCode/Alibaba). Providers with automatic server-side prefix caching — OpenAI, Kimi, DeepSeek, Qwen on OpenRouter — return `prompt_tokens_details.cached_tokens` too, but users couldn't see their cache % because the display path never fired for them. Result: people couldn't tell whether their cache was working or broken without grepping agent.log. `canonical_usage` from `normalize_usage()` already unifies all three API shapes (Anthropic / Codex Responses / OpenAI chat completions) into `cache_read_tokens` and `cache_write_tokens`. Drop the gate and read from there — now the footer fires whenever the provider reports any cached or written tokens, regardless of whether hermes injected markers. Also removes duplicated branch-per-API-shape extraction code. 
--- run_agent.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/run_agent.py b/run_agent.py index fadf28b31..c5881b87f 100644 --- a/run_agent.py +++ b/run_agent.py @@ -9907,22 +9907,27 @@ class AIAgent: if self.verbose_logging: logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}") - # Log cache hit stats when prompt caching is active - if self._use_prompt_caching: - if self.api_mode == "anthropic_messages": - _tcs = self._get_anthropic_transport() - _cache = _tcs.extract_cache_stats(response) - cached = _cache["cached_tokens"] if _cache else 0 - written = _cache["creation_tokens"] if _cache else 0 - else: - # OpenRouter uses prompt_tokens_details.cached_tokens - details = getattr(response.usage, 'prompt_tokens_details', None) - cached = getattr(details, 'cached_tokens', 0) or 0 if details else 0 - written = getattr(details, 'cache_write_tokens', 0) or 0 if details else 0 - prompt = usage_dict["prompt_tokens"] + # Surface cache hit stats for any provider that reports + # them — not just those where we inject cache_control + # markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic + # server-side prefix caching and return + # ``prompt_tokens_details.cached_tokens``; users + # previously could not see their cache % because this + # line was gated on ``_use_prompt_caching``, which is + # only True for Anthropic-style marker injection. + # ``canonical_usage`` is already normalised from all + # three API shapes (Anthropic / Codex / OpenAI-chat) + # so we can rely on its values directly. 
+ cached = canonical_usage.cache_read_tokens + written = canonical_usage.cache_write_tokens + prompt = usage_dict["prompt_tokens"] + if (cached or written) and not self.quiet_mode: hit_pct = (cached / prompt * 100) if prompt > 0 else 0 - if not self.quiet_mode: - self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)") + self._vprint( + f"{self.log_prefix} 💾 Cache: " + f"{cached:,}/{prompt:,} tokens " + f"({hit_pct:.0f}% hit, {written:,} written)" + ) has_retried_429 = False # Reset on success # Clear Nous rate limit state on successful request —