From 9f1c16a7fbb413d6e7d41802052fb225d6b4d8bf Mon Sep 17 00:00:00 2001
From: kamonspecial <10783249+kamonspecial@users.noreply.github.com>
Date: Sun, 7 Jun 2026 00:06:39 +0900
Subject: [PATCH] fix(langfuse): restore usage/cost when post_api_request sends
 a sanitized response

on_post_llm_call picked its usage source with `if response is not None:`,
so any non-None response took the response-object path. But
post_api_request delivers `response` as a sanitized dict (no `.usage`
attribute) alongside a separate `usage` summary dict, so
`getattr(response, "usage", None)` was always None and token/cost data
was dropped for every gateway turn (traces showed usage 0 / cost 0).

Gate on a real `.usage` attribute so the existing usage-dict fallback is
reached. Real response objects (post_llm_call / legacy) still take the
response-object path. Adds regression tests for both paths.
---
 plugins/observability/langfuse/__init__.py | 12 +++-
 tests/plugins/test_langfuse_plugin.py      | 73 ++++++++++++++++++++++
 2 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/plugins/observability/langfuse/__init__.py b/plugins/observability/langfuse/__init__.py
index 8516030fb01..a18ebf98fc9 100644
--- a/plugins/observability/langfuse/__init__.py
+++ b/plugins/observability/langfuse/__init__.py
@@ -837,8 +837,16 @@ def on_post_llm_call(*, task_id: str = "", session_id: str = "", provider: str =
     if output.get("tool_calls"):
         state.turn_tool_calls.extend(output["tool_calls"])
 
-    # Extract usage: prefer response object, fall back to usage dict from post_api_request
-    if response is not None:
+    # Extract usage: prefer a real response object that carries usage, else
+    # fall back to the usage summary dict from post_api_request.
+    #
+    # post_api_request passes `response` as a SANITIZED dict (no ``.usage``
+    # attribute) alongside a separate `usage` summary dict. Gating on
+    # ``response is not None`` here took the response-object path on that dict,
+    # where ``getattr(response, "usage", None)`` is always None — so usage and
+    # cost were silently dropped for every gateway turn. Gate on a real
+    # ``.usage`` attribute instead so the usage-dict fallback below is reached.
+    if getattr(response, "usage", None) is not None:
         usage_details, cost_details = _usage_and_cost(
             response,
             provider=provider,
diff --git a/tests/plugins/test_langfuse_plugin.py b/tests/plugins/test_langfuse_plugin.py
index 313d2e94a72..51c8c3f4635 100644
--- a/tests/plugins/test_langfuse_plugin.py
+++ b/tests/plugins/test_langfuse_plugin.py
@@ -704,3 +704,76 @@ class TestToolObservationKeying:
         assert ended["output"] == {"status": "done"}
         assert not state.tools
 
+
+class TestUsageFromSanitizedResponse:
+    """Regression: ``post_api_request`` delivers ``response`` as a sanitized
+    dict (no ``.usage`` attribute) plus a separate ``usage`` summary dict. The
+    post-call handler must read the ``usage`` dict instead of treating the dict
+    response as a usage-bearing object and dropping all token/cost data."""
+
+    def _setup(self, mod, monkeypatch):
+        # Active client so on_post_llm_call does not early-return.
+        monkeypatch.setattr(mod, "_get_langfuse", lambda: object())
+        observation = object()
+        state = mod.TraceState(trace_id="trace-1", root_ctx=None, root_span=None)
+        state.generations[mod._request_key(1)] = observation  # presumably looked up via api_call_count=1
+        monkeypatch.setitem(mod._TRACE_STATE, mod._trace_key("task-1", "session-1"), state)
+        captured = {}
+
+        def fake_end_observation(obs, *, output=None, metadata=None, usage_details=None, cost_details=None):
+            captured["usage_details"] = usage_details  # the only field these tests assert on
+
+        monkeypatch.setattr(mod, "_end_observation", fake_end_observation)
+        return captured
+
+    def test_sanitized_dict_response_uses_usage_dict(self, monkeypatch):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        mod = importlib.import_module("plugins.observability.langfuse")
+        captured = self._setup(mod, monkeypatch)
+
+        # The dict has a "usage" key but no ``.usage`` attribute, as in post_api_request.
+        mod.on_post_llm_call(
+            task_id="task-1",
+            session_id="session-1",
+            api_call_count=1,
+            model="gemini-3-flash-preview",
+            response={"model": "gemini-3-flash-preview", "usage": {"input_tokens": 100, "output_tokens": 20}},
+            usage={"input_tokens": 100, "output_tokens": 20},
+            assistant_content_chars=42,
+        )
+
+        # Before the fix the dict response shadowed the usage dict and tokens
+        # were lost (usage_details == {}).
+        assert captured["usage_details"] == {"input": 100, "output": 20}
+
+    def test_real_response_object_with_usage_still_used(self, monkeypatch):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        mod = importlib.import_module("plugins.observability.langfuse")
+        captured = self._setup(mod, monkeypatch)
+
+        # A response object that genuinely carries usage must still take the
+        # response-object path (post_llm_call / legacy behavior).
+        seen = {}
+
+        def fake_usage_and_cost(resp, **_):
+            seen["resp"] = resp
+            return {"input": 7, "output": 3}, {}
+
+        monkeypatch.setattr(mod, "_usage_and_cost", fake_usage_and_cost)
+
+        class _Resp:
+            usage = {"prompt_tokens": 7, "completion_tokens": 3}
+
+        resp = _Resp()
+        mod.on_post_llm_call(
+            task_id="task-1",
+            session_id="session-1",
+            api_call_count=1,
+            model="gemini-3-flash-preview",
+            response=resp,
+            usage={"input_tokens": 999, "output_tokens": 999},  # decoy: must not reach usage_details
+            assistant_content_chars=42,
+        )
+
+        assert seen["resp"] is resp
+        assert captured["usage_details"] == {"input": 7, "output": 3}