diff --git a/plugins/observability/langfuse/__init__.py b/plugins/observability/langfuse/__init__.py
index a18ebf98fc9..b992484b05e 100644
--- a/plugins/observability/langfuse/__init__.py
+++ b/plugins/observability/langfuse/__init__.py
@@ -227,7 +227,52 @@ def _trace_key(task_id: str, session_id: str) -> str:
     return f"thread:{threading.get_ident()}"
 
 
-def _truncate_text(value: str, max_chars: int) -> str:
+def _is_base64_data_uri(value: str) -> bool:
+    """Return True when *value* looks like a ``data:...;base64,`` URI.
+
+    Only the first 200 characters are inspected, case-insensitively, so a
+    multi-megabyte payload is never lowercased or scanned in full.
+    """
+    prefix = value[:200].lower()
+    return prefix.startswith("data:") and ";base64," in prefix
+
+
+def _redact_data_uri(value: str) -> dict[str, Any]:
+    """Describe a data URI without keeping its payload.
+
+    The result carries the declared media type (``None`` when the header
+    names none) and the length of the original string.
+    """
+    # Slice the header out by index: str.split() would also build a copy of
+    # the payload, which can be many megabytes.
+    comma = value.find(",")
+    header = value[:comma] if comma != -1 else "data:"
+    # Match the scheme case-insensitively, as _is_base64_data_uri does, so
+    # "DATA:image/png;base64,..." keeps its media type.
+    has_scheme = header[:5].lower() == "data:"
+    media_type = header[5:].split(";", 1)[0] if has_scheme else ""
+    return {
+        "type": "data_uri",
+        "media_type": media_type or None,
+        "omitted": True,
+        "length": len(value),
+    }
+
+
+def _truncate_text(value: str, max_chars: int) -> Any:
+    """Cap *value* at *max_chars*, replacing base64 data URIs with metadata.
+
+    Returns *value* unchanged when it fits, a shortened string ending in a
+    "[truncated N chars]" marker when it does not, or a metadata dict when
+    *value* is a base64 data URI.
+    """
+    # Langfuse SDK treats data:*;base64 strings as media and attempts to
+    # decode them. Truncating those strings produces invalid base64 and noisy
+    # "Error parsing base64 data URI" logs. Observability only needs metadata,
+    # not raw image/audio payloads, so redact the whole data URI before it
+    # reaches the SDK.
+    if _is_base64_data_uri(value):
+        return _redact_data_uri(value)
     if len(value) <= max_chars:
         return value
     return value[:max_chars] + f"... \n[truncated {len(value) - max_chars} chars]"
diff --git a/tests/plugins/test_langfuse_plugin.py b/tests/plugins/test_langfuse_plugin.py
index 51c8c3f4635..ca91feae613 100644
--- a/tests/plugins/test_langfuse_plugin.py
+++ b/tests/plugins/test_langfuse_plugin.py
@@ -171,6 +171,54 @@ class TestHooksInert:
         mod.on_post_tool_call(tool_name="read_file", args={}, result="ok", task_id="t", session_id="s")
 
 
+class TestPayloadSanitization:
+    def test_safe_value_redacts_base64_data_uri_instead_of_truncating(self):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        import importlib
+        mod = importlib.import_module("plugins.observability.langfuse")
+
+        payload = "data:image/png;base64," + ("a" * 20000)
+        result = mod._safe_value(payload)
+
+        assert result == {
+            "type": "data_uri",
+            "media_type": "image/png",
+            "omitted": True,
+            "length": len(payload),
+        }
+
+    def test_serialize_messages_redacts_data_uri_parts(self):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        import importlib
+        mod = importlib.import_module("plugins.observability.langfuse")
+
+        payload = "data:image/jpeg;base64," + ("b" * 20000)
+        serialized = mod._serialize_messages([
+            {"role": "user", "content": [{"type": "image_url", "image_url": {"url": payload}}]}
+        ])
+
+        assert serialized[0]["content"][0]["image_url"]["url"] == {
+            "type": "data_uri",
+            "media_type": "image/jpeg",
+            "omitted": True,
+            "length": len(payload),
+        }
+
+    def test_truncate_text_redacts_uppercase_scheme_data_uri(self):
+        sys.modules.pop("plugins.observability.langfuse", None)
+        import importlib
+        mod = importlib.import_module("plugins.observability.langfuse")
+
+        payload = "DATA:image/png;base64," + ("a" * 64)
+
+        assert mod._truncate_text(payload, 16) == {
+            "type": "data_uri",
+            "media_type": "image/png",
+            "omitted": True,
+            "length": len(payload),
+        }
+
+
 # ---------------------------------------------------------------------------
 # Placeholder-credential guard (#23823).
 #