From 46427622894025dee21862a6a7e5ab5f92c313f3 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:49:36 +0530 Subject: [PATCH] fix(langfuse): redact base64 data URIs instead of truncating into invalid base64 The Langfuse SDK treats `data:*;base64,...` strings as media and tries to decode them. `_truncate_text` was slicing those strings mid-payload, producing invalid base64 and noisy "Error parsing base64 data URI" logs. Observability only needs the metadata, not raw image/audio bytes, so redact the whole data URI (type, media_type, length) before it reaches the SDK. Salvaged the Langfuse fix from #39682 onto current main as a standalone, single-concern change (the dashboard `dist/**` and plugin-discovery parts of that PR already landed separately on main). Co-authored-by: foras910521-lab --- plugins/observability/langfuse/__init__.py | 25 +++++++++++++++- tests/plugins/test_langfuse_plugin.py | 34 ++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/plugins/observability/langfuse/__init__.py b/plugins/observability/langfuse/__init__.py index a18ebf98fc9..b992484b05e 100644 --- a/plugins/observability/langfuse/__init__.py +++ b/plugins/observability/langfuse/__init__.py @@ -227,7 +227,30 @@ def _trace_key(task_id: str, session_id: str) -> str: return f"thread:{threading.get_ident()}" -def _truncate_text(value: str, max_chars: int) -> str: +def _is_base64_data_uri(value: str) -> bool: + prefix = value[:200].lower() + return prefix.startswith("data:") and ";base64," in prefix + + +def _redact_data_uri(value: str) -> dict[str, Any]: + header = value.split(",", 1)[0] if "," in value else "data:" + media_type = header[5:].split(";", 1)[0] if header.startswith("data:") else "" + return { + "type": "data_uri", + "media_type": media_type or None, + "omitted": True, + "length": len(value), + } + + +def _truncate_text(value: str, max_chars: int) -> Any: + # Langfuse SDK treats data:*;base64 strings as media and attempts to + # decode them. Truncating those strings produces invalid base64 and noisy + # "Error parsing base64 data URI" logs. Observability only needs metadata, + # not raw image/audio payloads, so redact the whole data URI before it + # reaches the SDK. + if _is_base64_data_uri(value): + return _redact_data_uri(value) if len(value) <= max_chars: return value return value[:max_chars] + f"... [truncated {len(value) - max_chars} chars]" diff --git a/tests/plugins/test_langfuse_plugin.py b/tests/plugins/test_langfuse_plugin.py index 51c8c3f4635..ca91feae613 100644 --- a/tests/plugins/test_langfuse_plugin.py +++ b/tests/plugins/test_langfuse_plugin.py @@ -171,6 +171,40 @@ class TestHooksInert: mod.on_post_tool_call(tool_name="read_file", args={}, result="ok", task_id="t", session_id="s") +class TestPayloadSanitization: + def test_safe_value_redacts_base64_data_uri_instead_of_truncating(self): + sys.modules.pop("plugins.observability.langfuse", None) + import importlib + mod = importlib.import_module("plugins.observability.langfuse") + + payload = "data:image/png;base64," + ("a" * 20000) + result = mod._safe_value(payload) + + assert result == { + "type": "data_uri", + "media_type": "image/png", + "omitted": True, + "length": len(payload), + } + + def test_serialize_messages_redacts_data_uri_parts(self): + sys.modules.pop("plugins.observability.langfuse", None) + import importlib + mod = importlib.import_module("plugins.observability.langfuse") + + payload = "data:image/jpeg;base64," + ("b" * 20000) + serialized = mod._serialize_messages([ + {"role": "user", "content": [{"type": "image_url", "image_url": {"url": payload}}]} + ]) + + assert serialized[0]["content"][0]["image_url"]["url"] == { + "type": "data_uri", + "media_type": "image/jpeg", + "omitted": True, + "length": len(payload), + } + + # --------------------------------------------------------------------------- # Placeholder-credential guard (#23823). #