mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(cache): surface cache-hit telemetry for all providers, not just Anthropic-wire (#13543)
The 💾 Cache footer was gated on `self._use_prompt_caching`, which is
only True for Anthropic marker injection (native Anthropic, OpenRouter
Claude, Anthropic-wire gateways, Qwen on OpenCode/Alibaba). Providers
with automatic server-side prefix caching — OpenAI, Kimi, DeepSeek,
Qwen on OpenRouter — return `prompt_tokens_details.cached_tokens` too,
but users couldn't see their cache % because the display path never
fired for them. Result: people couldn't tell their cache was working or
broken without grepping agent.log.
`canonical_usage` from `normalize_usage()` already unifies all three
API shapes (Anthropic / Codex Responses / OpenAI chat completions) into
`cache_read_tokens` and `cache_write_tokens`. Drop the gate and read
from there — now the footer fires whenever the provider reported any
cached or written tokens, regardless of whether hermes injected markers.
Also removes duplicated branch-per-API-shape extraction code.
This commit is contained in:
parent
5e0eed470f
commit
432772dbdf
1 changed file with 20 additions and 15 deletions
35
run_agent.py
35
run_agent.py
|
|
@ -9907,22 +9907,27 @@ class AIAgent:
|
||||||
if self.verbose_logging:
|
if self.verbose_logging:
|
||||||
logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
|
logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
|
||||||
|
|
||||||
# Log cache hit stats when prompt caching is active
|
# Surface cache hit stats for any provider that reports
|
||||||
if self._use_prompt_caching:
|
# them — not just those where we inject cache_control
|
||||||
if self.api_mode == "anthropic_messages":
|
# markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic
|
||||||
_tcs = self._get_anthropic_transport()
|
# server-side prefix caching and return
|
||||||
_cache = _tcs.extract_cache_stats(response)
|
# ``prompt_tokens_details.cached_tokens``; users
|
||||||
cached = _cache["cached_tokens"] if _cache else 0
|
# previously could not see their cache % because this
|
||||||
written = _cache["creation_tokens"] if _cache else 0
|
# line was gated on ``_use_prompt_caching``, which is
|
||||||
else:
|
# only True for Anthropic-style marker injection.
|
||||||
# OpenRouter uses prompt_tokens_details.cached_tokens
|
# ``canonical_usage`` is already normalised from all
|
||||||
details = getattr(response.usage, 'prompt_tokens_details', None)
|
# three API shapes (Anthropic / Codex / OpenAI-chat)
|
||||||
cached = getattr(details, 'cached_tokens', 0) or 0 if details else 0
|
# so we can rely on its values directly.
|
||||||
written = getattr(details, 'cache_write_tokens', 0) or 0 if details else 0
|
cached = canonical_usage.cache_read_tokens
|
||||||
prompt = usage_dict["prompt_tokens"]
|
written = canonical_usage.cache_write_tokens
|
||||||
|
prompt = usage_dict["prompt_tokens"]
|
||||||
|
if (cached or written) and not self.quiet_mode:
|
||||||
hit_pct = (cached / prompt * 100) if prompt > 0 else 0
|
hit_pct = (cached / prompt * 100) if prompt > 0 else 0
|
||||||
if not self.quiet_mode:
|
self._vprint(
|
||||||
self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
|
f"{self.log_prefix} 💾 Cache: "
|
||||||
|
f"{cached:,}/{prompt:,} tokens "
|
||||||
|
f"({hit_pct:.0f}% hit, {written:,} written)"
|
||||||
|
)
|
||||||
|
|
||||||
has_retried_429 = False # Reset on success
|
has_retried_429 = False # Reset on success
|
||||||
# Clear Nous rate limit state on successful request —
|
# Clear Nous rate limit state on successful request —
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue