From b9463e32c6e240636f7dda68aec8d74cc479b0c8 Mon Sep 17 00:00:00 2001
From: Teknium
Date: Wed, 22 Apr 2026 17:03:35 -0700
Subject: [PATCH] fix(usage): read top-level Anthropic cache fields from
 OAI-compatible proxies

Port from cline/cline#10266.

When OpenAI-compatible proxies (OpenRouter, Vercel AI Gateway, Cline) route
Claude models, they sometimes surface the Anthropic-native cache counters
(`cache_read_input_tokens`, `cache_creation_input_tokens`) at the top level
of the `usage` object instead of nesting them inside `prompt_tokens_details`.

Our chat-completions branch of `normalize_usage()` only read the nested
`prompt_tokens_details` fields, so those responses:

- reported `cache_write_tokens = 0` even when the model actually did a
  prompt-cache write,
- missed cache-read tokens entirely when the proxy exposed them only at the
  top level,
- overstated `input_tokens` by the missed cache-write amount, which in turn
  made cost estimation and the status-bar cache-hit percentage wrong for
  Claude traffic going through these gateways.

Now the chat-completions branch tries the OpenAI-standard
`prompt_tokens_details` first and falls back to the top-level Anthropic-shape
fields only if the nested values are absent/zero. The Anthropic and Codex
Responses branches are unchanged.

Regression guards added for three shapes: top-level write + nested read,
top-level-only, and both-present (nested wins).
---
 agent/usage_pricing.py            | 12 ++++++
 tests/agent/test_usage_pricing.py | 67 +++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/agent/usage_pricing.py b/agent/usage_pricing.py
index 3554c5b99..1dfe59ea3 100644
--- a/agent/usage_pricing.py
+++ b/agent/usage_pricing.py
@@ -533,10 +533,22 @@ def normalize_usage(
     prompt_total = _to_int(getattr(response_usage, "prompt_tokens", 0))
     output_tokens = _to_int(getattr(response_usage, "completion_tokens", 0))
     details = getattr(response_usage, "prompt_tokens_details", None)
+    # Primary: OpenAI-style prompt_tokens_details. Fallback: Anthropic-style
+    # top-level fields that some OpenAI-compatible proxies (OpenRouter, Vercel
+    # AI Gateway, Cline) expose when routing Claude models; without this
+    # fallback, cache writes are undercounted as 0 and cache reads can be
+    # missed when the proxy only surfaces them at the top level.
+    # Port of cline/cline#10266.
     cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
+    if not cache_read_tokens:
+        cache_read_tokens = _to_int(getattr(response_usage, "cache_read_input_tokens", 0))
     cache_write_tokens = _to_int(
         getattr(details, "cache_write_tokens", 0) if details else 0
     )
+    if not cache_write_tokens:
+        cache_write_tokens = _to_int(
+            getattr(response_usage, "cache_creation_input_tokens", 0)
+        )
     input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens)
 
     reasoning_tokens = 0
diff --git a/tests/agent/test_usage_pricing.py b/tests/agent/test_usage_pricing.py
index a65668bb4..5daace97d 100644
--- a/tests/agent/test_usage_pricing.py
+++ b/tests/agent/test_usage_pricing.py
@@ -39,6 +39,73 @@ def test_normalize_usage_openai_subtracts_cached_prompt_tokens():
     assert normalized.output_tokens == 700
 
 
+def test_normalize_usage_openai_reads_top_level_anthropic_cache_fields():
+    """Some OpenAI-compatible proxies (OpenRouter, Vercel AI Gateway, Cline) expose
+    Anthropic-style cache token counts at the top level of the usage object when
+    routing Claude models, instead of nesting them in prompt_tokens_details.
+
+    Regression guard for the bug fixed in cline/cline#10266: before this fix,
+    the chat-completions branch of normalize_usage() only read
+    prompt_tokens_details.cache_write_tokens and completely missed the
+    cache_creation_input_tokens case, so cache writes showed as 0 and
+    input_tokens were overstated by the cache-write amount.
+    """
+    usage = SimpleNamespace(
+        prompt_tokens=1000,
+        completion_tokens=200,
+        prompt_tokens_details=SimpleNamespace(cached_tokens=500),
+        cache_creation_input_tokens=300,
+    )
+
+    normalized = normalize_usage(usage, provider="openrouter", api_mode="chat_completions")
+
+    # Expected: cache read from prompt_tokens_details.cached_tokens (preferred),
+    # cache write from top-level cache_creation_input_tokens (fallback).
+    assert normalized.cache_read_tokens == 500
+    assert normalized.cache_write_tokens == 300
+    # input_tokens = prompt_total - cache_read - cache_write = 1000 - 500 - 300 = 200
+    assert normalized.input_tokens == 200
+    assert normalized.output_tokens == 200
+
+
+def test_normalize_usage_openai_reads_top_level_cache_read_when_details_missing():
+    """Some proxies expose only top-level Anthropic-style fields with no
+    prompt_tokens_details object. Regression guard for cline/cline#10266.
+    """
+    usage = SimpleNamespace(
+        prompt_tokens=1000,
+        completion_tokens=200,
+        cache_read_input_tokens=500,
+        cache_creation_input_tokens=300,
+    )
+
+    normalized = normalize_usage(usage, provider="openrouter", api_mode="chat_completions")
+
+    assert normalized.cache_read_tokens == 500
+    assert normalized.cache_write_tokens == 300
+    assert normalized.input_tokens == 200
+
+
+def test_normalize_usage_openai_prefers_prompt_tokens_details_over_top_level():
+    """When both prompt_tokens_details and top-level Anthropic fields are
+    present, we prefer the OpenAI-standard nested fields. Top-level Anthropic
+    fields are only a fallback when the nested ones are absent/zero.
+    """
+    usage = SimpleNamespace(
+        prompt_tokens=1000,
+        completion_tokens=200,
+        prompt_tokens_details=SimpleNamespace(cached_tokens=600, cache_write_tokens=150),
+        # Intentionally different values: proves we ignore these when details exist.
+        cache_read_input_tokens=999,
+        cache_creation_input_tokens=999,
+    )
+
+    normalized = normalize_usage(usage, provider="openrouter", api_mode="chat_completions")
+
+    assert normalized.cache_read_tokens == 600
+    assert normalized.cache_write_tokens == 150
+
+
 def test_openrouter_models_api_pricing_is_converted_from_per_token_to_per_million(monkeypatch):
     monkeypatch.setattr(
         "agent.usage_pricing.fetch_model_metadata",
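Reviewer note, not part of the patch: below is a minimal, runnable sketch of
the fallback order the agent/usage_pricing.py hunk implements, covering the
three usage shapes the new tests guard. extract_cache_tokens and the inline
_to_int are illustrative stand-ins for this note only (the repo's real
_to_int may behave differently); the actual logic lives inside
normalize_usage().

from types import SimpleNamespace


def _to_int(value) -> int:
    # Assumed behavior of the repo's _to_int helper: coerce to int, 0 on junk.
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


def extract_cache_tokens(usage) -> tuple[int, int]:
    # Prefer the OpenAI-standard nested prompt_tokens_details fields; fall
    # back to the Anthropic-shape top-level fields only when the nested
    # values are absent/zero. Mirrors the patched chat-completions branch.
    details = getattr(usage, "prompt_tokens_details", None)
    cache_read = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
    if not cache_read:
        cache_read = _to_int(getattr(usage, "cache_read_input_tokens", 0))
    cache_write = _to_int(getattr(details, "cache_write_tokens", 0) if details else 0)
    if not cache_write:
        cache_write = _to_int(getattr(usage, "cache_creation_input_tokens", 0))
    return cache_read, cache_write


# Shape 1: nested read + top-level write (e.g. OpenRouter routing Claude).
mixed = SimpleNamespace(
    prompt_tokens_details=SimpleNamespace(cached_tokens=500),
    cache_creation_input_tokens=300,
)
assert extract_cache_tokens(mixed) == (500, 300)

# Shape 2: top-level fields only, no prompt_tokens_details object at all.
top_only = SimpleNamespace(cache_read_input_tokens=500, cache_creation_input_tokens=300)
assert extract_cache_tokens(top_only) == (500, 300)

# Shape 3: both present; the nested OpenAI-standard fields win.
both = SimpleNamespace(
    prompt_tokens_details=SimpleNamespace(cached_tokens=600, cache_write_tokens=150),
    cache_read_input_tokens=999,
    cache_creation_input_tokens=999,
)
assert extract_cache_tokens(both) == (600, 150)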