From 9f1c16a7fbb413d6e7d41802052fb225d6b4d8bf Mon Sep 17 00:00:00 2001 From: kamonspecial <10783249+kamonspecial@users.noreply.github.com> Date: Sun, 7 Jun 2026 00:06:39 +0900 Subject: [PATCH] fix(langfuse): restore usage/cost when post_api_request sends a sanitized response on_post_llm_call extracted usage via `if response is not None:`, taking the response-object path. But post_api_request delivers `response` as a sanitized dict (no `.usage` attribute) alongside a separate `usage` summary dict, so `getattr(response, "usage", None)` was always None and token/cost data was dropped for every gateway turn (traces showed usage 0 / cost 0). Gate on a real `.usage` attribute so the existing usage-dict fallback is reached. Real response objects (post_llm_call / legacy) still take the response-object path. Adds regression tests for both paths. --- plugins/observability/langfuse/__init__.py | 12 +++- tests/plugins/test_langfuse_plugin.py | 73 ++++++++++++++++++++++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/plugins/observability/langfuse/__init__.py b/plugins/observability/langfuse/__init__.py index 8516030fb01..a18ebf98fc9 100644 --- a/plugins/observability/langfuse/__init__.py +++ b/plugins/observability/langfuse/__init__.py @@ -837,8 +837,16 @@ def on_post_llm_call(*, task_id: str = "", session_id: str = "", provider: str = if output.get("tool_calls"): state.turn_tool_calls.extend(output["tool_calls"]) - # Extract usage: prefer response object, fall back to usage dict from post_api_request - if response is not None: + # Extract usage: prefer a real response object that carries usage, else + # fall back to the usage summary dict from post_api_request. + # + # post_api_request passes `response` as a SANITIZED dict (no ``.usage`` + # attribute) alongside a separate `usage` summary dict.
Gating on + # ``response is not None`` here took the response-object path on that dict, + # where ``getattr(response, "usage", None)`` is always None — so usage and + # cost were silently dropped for every gateway turn. Gate on a real + # ``.usage`` attribute instead so the usage-dict fallback below is reached. + if getattr(response, "usage", None) is not None: usage_details, cost_details = _usage_and_cost( response, provider=provider, diff --git a/tests/plugins/test_langfuse_plugin.py b/tests/plugins/test_langfuse_plugin.py index 313d2e94a72..51c8c3f4635 100644 --- a/tests/plugins/test_langfuse_plugin.py +++ b/tests/plugins/test_langfuse_plugin.py @@ -704,3 +704,76 @@ class TestToolObservationKeying: assert ended["output"] == {"status": "done"} assert not state.tools + +class TestUsageFromSanitizedResponse: + """Regression: ``post_api_request`` delivers ``response`` as a sanitized + dict (no ``.usage`` attribute) plus a separate ``usage`` summary dict. The + post-call handler must read the ``usage`` dict instead of treating the dict + response as a usage-bearing object and dropping all token/cost data.""" + + def _setup(self, mod, monkeypatch): + # Active client so on_post_llm_call does not early-return. 
+ monkeypatch.setattr(mod, "_get_langfuse", lambda: object()) + observation = object() + state = mod.TraceState(trace_id="trace-1", root_ctx=None, root_span=None) + state.generations[mod._request_key(1)] = observation + monkeypatch.setitem(mod._TRACE_STATE, mod._trace_key("task-1", "session-1"), state) + captured = {} + + def fake_end_observation(obs, *, output=None, metadata=None, usage_details=None, cost_details=None): + captured["usage_details"] = usage_details + + monkeypatch.setattr(mod, "_end_observation", fake_end_observation) + return captured + + def test_sanitized_dict_response_uses_usage_dict(self, monkeypatch): + sys.modules.pop("plugins.observability.langfuse", None) + mod = importlib.import_module("plugins.observability.langfuse") + captured = self._setup(mod, monkeypatch) + + # A plain dict has no ``.usage`` attribute — mirrors post_api_request. + mod.on_post_llm_call( + task_id="task-1", + session_id="session-1", + api_call_count=1, + model="gemini-3-flash-preview", + response={"model": "gemini-3-flash-preview", "usage": {"input_tokens": 100, "output_tokens": 20}}, + usage={"input_tokens": 100, "output_tokens": 20}, + assistant_content_chars=42, + ) + + # Before the fix the dict response shadowed the usage dict and tokens + # were lost (usage_details == {}). + assert captured["usage_details"] == {"input": 100, "output": 20} + + def test_real_response_object_with_usage_still_used(self, monkeypatch): + sys.modules.pop("plugins.observability.langfuse", None) + mod = importlib.import_module("plugins.observability.langfuse") + captured = self._setup(mod, monkeypatch) + + # A response object that genuinely carries usage must still take the + # response-object path (post_llm_call / legacy behavior). 
+ seen = {} + + def fake_usage_and_cost(resp, **_): + seen["resp"] = resp + return {"input": 7, "output": 3}, {} + + monkeypatch.setattr(mod, "_usage_and_cost", fake_usage_and_cost) + + class _Resp: + usage = {"prompt_tokens": 7, "completion_tokens": 3} + + resp = _Resp() + mod.on_post_llm_call( + task_id="task-1", + session_id="session-1", + api_call_count=1, + model="gemini-3-flash-preview", + response=resp, + usage={"input_tokens": 999, "output_tokens": 999}, + assistant_content_chars=42, + ) + + assert seen["resp"] is resp + assert captured["usage_details"] == {"input": 7, "output": 3}