Merge pull request #40560 from kamonspecial/fix/langfuse-usage-sanitized-response

fix(langfuse): restore usage/cost when post_api_request sends a sanitized response
2026-07-24 16:54:43 +00:00 · 2026-06-08 15:04:37 -07:00 · 2026-06-08 15:04:37 -07:00 · 1e3b3dfabb
commit 1e3b3dfabb
parent 09a6a2ddd7 9f1c16a7fb
2 changed files with 83 additions and 2 deletions
--- a/plugins/observability/langfuse/__init__.py
+++ b/plugins/observability/langfuse/__init__.py
@@ -837,8 +837,16 @@ def on_post_llm_call(*, task_id: str = "", session_id: str = "", provider: str =
    if output.get("tool_calls"):
        state.turn_tool_calls.extend(output["tool_calls"])

-    # Extract usage: prefer response object, fall back to usage dict from post_api_request
-    if response is not None:
+    # Extract usage: prefer a real response object that carries usage, else
+    # fall back to the usage summary dict from post_api_request.
+    #
+    # post_api_request passes `response` as a SANITIZED dict (no ``.usage``
+    # attribute) alongside a separate `usage` summary dict. Gating on
+    # ``response is not None`` here took the response-object path on that dict,
+    # where ``getattr(response, "usage", None)`` is always None — so usage and
+    # cost were silently dropped for every gateway turn. Gate on a real
+    # ``.usage`` attribute instead so the usage-dict fallback below is reached.
+    if getattr(response, "usage", None) is not None:
        usage_details, cost_details = _usage_and_cost(
            response,
            provider=provider,
--- a/tests/plugins/test_langfuse_plugin.py
+++ b/tests/plugins/test_langfuse_plugin.py
@@ -704,3 +704,76 @@ class TestToolObservationKeying:
        assert ended["output"] == {"status": "done"}
        assert not state.tools

+
+class TestUsageFromSanitizedResponse:
+    """Regression: ``post_api_request`` delivers ``response`` as a sanitized
+    dict (no ``.usage`` attribute) plus a separate ``usage`` summary dict. The
+    post-call handler must read the ``usage`` dict instead of treating the dict
+    response as a usage-bearing object and dropping all token/cost data."""
+
+    def _setup(self, mod, monkeypatch):
+        # Active client so on_post_llm_call does not early-return.
+        monkeypatch.setattr(mod, "_get_langfuse", lambda: object())
+        observation = object()
+        state = mod.TraceState(trace_id="trace-1", root_ctx=None, root_span=None)
+        state.generations[mod._request_key(1)] = observation
+        monkeypatch.setitem(mod._TRACE_STATE, mod._trace_key("task-1", "session-1"), state)
+        captured = {}
+
+        def fake_end_observation(obs, *, output=None, metadata=None, usage_details=None, cost_details=None):
+            captured["usage_details"] = usage_details
+
+        monkeypatch.setattr(mod, "_end_observation", fake_end_observation)
+        return captured
+
+    def test_sanitized_dict_response_uses_usage_dict(self, monkeypatch):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        mod = importlib.import_module("plugins.observability.langfuse")
+        captured = self._setup(mod, monkeypatch)
+
+        # A plain dict has no ``.usage`` attribute — mirrors post_api_request.
+        mod.on_post_llm_call(
+            task_id="task-1",
+            session_id="session-1",
+            api_call_count=1,
+            model="gemini-3-flash-preview",
+            response={"model": "gemini-3-flash-preview", "usage": {"input_tokens": 100, "output_tokens": 20}},
+            usage={"input_tokens": 100, "output_tokens": 20},
+            assistant_content_chars=42,
+        )
+
+        # Before the fix the dict response shadowed the usage dict and tokens
+        # were lost (usage_details == {}).
+        assert captured["usage_details"] == {"input": 100, "output": 20}
+
+    def test_real_response_object_with_usage_still_used(self, monkeypatch):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        mod = importlib.import_module("plugins.observability.langfuse")
+        captured = self._setup(mod, monkeypatch)
+
+        # A response object that genuinely carries usage must still take the
+        # response-object path (post_llm_call / legacy behavior).
+        seen = {}
+
+        def fake_usage_and_cost(resp, **_):
+            seen["resp"] = resp
+            return {"input": 7, "output": 3}, {}
+
+        monkeypatch.setattr(mod, "_usage_and_cost", fake_usage_and_cost)
+
+        class _Resp:
+            usage = {"prompt_tokens": 7, "completion_tokens": 3}
+
+        resp = _Resp()
+        mod.on_post_llm_call(
+            task_id="task-1",
+            session_id="session-1",
+            api_call_count=1,
+            model="gemini-3-flash-preview",
+            response=resp,
+            usage={"input_tokens": 999, "output_tokens": 999},
+            assistant_content_chars=42,
+        )
+
+        assert seen["resp"] is resp
+        assert captured["usage_details"] == {"input": 7, "output": 3}