mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
fix(memory): flatten multimodal content before provider sync
Multimodal turns carry message content as a list of typed parts
({type: "text"|"image_url", ...}). _sync_external_memory_for_turn
passed that list straight into MemoryManager.sync_all, and providers
feed it to regexes — Honcho's sync_turn calls sanitize_context, where
re.sub raised "expected string or bytes-like object, got 'list'". Every
turn with an attached image silently never synced.
Flatten to plain text at the boundary: text parts joined, images noted
as an [N image(s)] marker so the attachment isn't erased from recall.
Fixing here covers all providers instead of patching each plugin.
This commit is contained in:
parent
2ee69d0579
commit
705bdb6ffe
4 changed files with 161 additions and 4 deletions
|
|
@ -67,6 +67,45 @@ def sanitize_context(text: str) -> str:
|
|||
return text
|
||||
|
||||
|
||||
def flatten_message_content(content: Any) -> str:
    """Flatten message content to plain text for memory providers.

    Multimodal turns carry content as a sequence of
    ``{type: "text"|"image_url", ...}`` parts; providers expect a string and
    feed it to regexes (``sanitize_context``) and text APIs, so a raw list
    crashes the sync (``expected string or bytes-like object, got 'list'``).
    Text parts are joined with newlines, and images become a ``[N image(s)]``
    marker so the turn isn't recorded as if the attachment never existed.

    Args:
        content: ``None``, a plain string, a list/tuple of parts (bare
            strings or typed part dicts), a single typed part dict, or any
            other scalar.

    Returns:
        ``""`` for ``None``; the string unchanged for ``str``; the flattened
        text (optionally prefixed with the image marker) for parts; and
        ``str(content)`` for any other value.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # A lone typed part is handled like a one-element list. Falling
        # through to str() would push the dict repr (including any base64
        # image payload) into providers that expect plain text.
        content = [content]
    if not isinstance(content, (list, tuple)):
        return str(content)

    text_bits: List[str] = []
    image_count = 0
    for part in content:
        if isinstance(part, str):
            # Bare strings inside the sequence count as text; empty ones
            # are dropped so they don't add blank lines to the join.
            if part:
                text_bits.append(part)
            continue
        if not isinstance(part, dict):
            # Unknown part shapes (numbers, None, ...) carry no text.
            continue
        ptype = str(part.get("type") or "").strip().lower()
        if ptype in {"text", "input_text", "output_text"}:
            text = part.get("text")
            if isinstance(text, str) and text:
                text_bits.append(text)
        elif ptype in {"image_url", "input_image"}:
            image_count += 1
        # Any other part type (e.g. audio) is skipped on purpose.

    flattened = "\n".join(text_bits).strip()
    if image_count:
        note = f"[{image_count} image{'s' if image_count != 1 else ''}]"
        flattened = f"{note} {flattened}" if flattened else note
    return flattened
|
||||
|
||||
|
||||
class StreamingContextScrubber:
|
||||
"""Stateful scrubber for streaming text that may contain split memory-context spans.
|
||||
|
||||
|
|
|
|||
14
run_agent.py
14
run_agent.py
|
|
@ -132,7 +132,7 @@ from tools.browser_tool import cleanup_browser
|
|||
|
||||
|
||||
# Agent internals extracted to agent/ package for modularity
|
||||
from agent.memory_manager import sanitize_context
|
||||
from agent.memory_manager import flatten_message_content, sanitize_context
|
||||
from agent.error_classifier import FailoverReason
|
||||
from agent.redact import redact_sensitive_text
|
||||
from agent.model_metadata import (
|
||||
|
|
@ -2990,17 +2990,23 @@ class AIAgent:
|
|||
return
|
||||
if not (self._memory_manager and final_response and original_user_message):
|
||||
return
|
||||
# Multimodal turns carry content as a list of typed parts; providers
|
||||
# expect plain strings (see flatten_message_content).
|
||||
user_text = flatten_message_content(original_user_message)
|
||||
response_text = flatten_message_content(final_response)
|
||||
if not (user_text and response_text):
|
||||
return
|
||||
try:
|
||||
sync_kwargs = {"session_id": self.session_id or ""}
|
||||
if messages is not None:
|
||||
sync_kwargs["messages"] = messages
|
||||
self._memory_manager.sync_all(
|
||||
original_user_message,
|
||||
final_response,
|
||||
user_text,
|
||||
response_text,
|
||||
**sync_kwargs,
|
||||
)
|
||||
self._memory_manager.queue_prefetch_all(
|
||||
original_user_message,
|
||||
user_text,
|
||||
session_id=self.session_id or "",
|
||||
)
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -979,6 +979,67 @@ class TestMemoryContextFencing:
|
|||
assert combined.index("weather") < fence_start
|
||||
|
||||
|
||||
class TestFlattenMessageContent:
    """Checks that list-of-parts message content is reduced to a plain string.

    Providers run regex sanitization on what they receive, and a raw list
    fails there with ``expected string or bytes-like object, got 'list'``.
    """

    def test_string_passthrough(self):
        """A plain string comes back unchanged."""
        from agent.memory_manager import flatten_message_content

        result = flatten_message_content("hello")
        assert result == "hello"

    def test_none_is_empty(self):
        """``None`` flattens to the empty string."""
        from agent.memory_manager import flatten_message_content

        result = flatten_message_content(None)
        assert result == ""

    def test_text_parts_joined(self):
        """Consecutive text parts are joined with a newline."""
        from agent.memory_manager import flatten_message_content

        parts = [{"type": "text", "text": word} for word in ("first", "second")]
        assert flatten_message_content(parts) == "first\nsecond"

    def test_image_part_becomes_marker(self):
        """An image part is replaced by a count marker ahead of the text."""
        from agent.memory_manager import flatten_message_content

        text_part = {"type": "text", "text": "look at this"}
        image_part = {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,xyz"},
        }
        result = flatten_message_content([text_part, image_part])
        assert result == "[1 image] look at this"

    def test_image_only_message(self):
        """With no text at all, only the pluralized marker remains."""
        from agent.memory_manager import flatten_message_content

        parts = [
            {"type": "image_url", "image_url": {"url": "data:..."}}
            for _ in range(2)
        ]
        assert flatten_message_content(parts) == "[2 images]"

    def test_unknown_parts_skipped(self):
        """Unrecognized part types and non-dict items contribute nothing."""
        from agent.memory_manager import flatten_message_content

        audio_part = {"type": "audio", "data": "..."}
        text_part = {"type": "text", "text": "ok"}
        result = flatten_message_content([audio_part, text_part, 42])
        assert result == "ok"

    def test_bare_strings_in_list(self):
        """Bare strings inside the list are treated as text parts."""
        from agent.memory_manager import flatten_message_content

        result = flatten_message_content(["plain", "strings"])
        assert result == "plain\nstrings"

    def test_scalar_fallback(self):
        """Other scalars fall back to their ``str()`` form."""
        from agent.memory_manager import flatten_message_content

        result = flatten_message_content(42)
        assert result == "42"

    def test_flattened_output_is_regex_safe(self):
        """The original failure: sanitize_context(list) raised TypeError."""
        from agent.memory_manager import flatten_message_content, sanitize_context

        parts = [
            {"type": "text", "text": "fix this bug"},
            {"type": "image_url", "image_url": {"url": "data:..."}},
        ]
        flattened = flatten_message_content(parts)
        # Must not raise, and must yield a non-empty result.
        assert sanitize_context(flattened)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AIAgent.commit_memory_session — routes to MemoryManager.on_session_end
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -207,6 +207,57 @@ class TestSyncExternalMemoryForTurn:
|
|||
# sync_all still happened before the prefetch blew up.
|
||||
agent._memory_manager.sync_all.assert_called_once()
|
||||
|
||||
# --- Multimodal content flattening ----------------------------------
|
||||
|
||||
def test_multimodal_user_message_is_flattened(self):
    """An image turn must reach the memory manager as flattened text.

    With an attachment, the user message arrives as a list of typed
    parts. Providers pass that content to regexes (sanitize_context),
    so the raw list raised ``expected string or bytes-like object,
    got 'list'`` and the turn silently never synced. This checks that
    both sync_all and queue_prefetch_all receive the flattened string.
    """
    expected_user_text = "[1 image] what is in this screenshot?"
    reply = "A terminal window showing a stack trace."
    multimodal_message = [
        {"type": "text", "text": "what is in this screenshot?"},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
    ]
    agent = _bare_agent()

    agent._sync_external_memory_for_turn(
        original_user_message=multimodal_message,
        final_response=reply,
        interrupted=False,
    )

    agent._memory_manager.sync_all.assert_called_once_with(
        expected_user_text,
        reply,
        session_id="test_session_001",
    )
    agent._memory_manager.queue_prefetch_all.assert_called_once_with(
        expected_user_text,
        session_id="test_session_001",
    )
|
||||
|
||||
def test_multimodal_response_is_flattened(self):
    """A list-of-parts assistant response is synced as its plain text."""
    response_parts = [{"type": "text", "text": "a cat"}]
    agent = _bare_agent()

    agent._sync_external_memory_for_turn(
        original_user_message="describe it",
        final_response=response_parts,
        interrupted=False,
    )

    expected_args = ("describe it", "a cat")
    agent._memory_manager.sync_all.assert_called_once_with(
        *expected_args,
        session_id="test_session_001",
    )
|
||||
|
||||
def test_multimodal_with_no_text_at_all_skips(self):
    """A turn whose parts flatten to an empty string is not synced.

    Only unknown-typed parts are supplied, so there is no recoverable
    text; neither sync_all nor queue_prefetch_all may be called.
    """
    audio_only_message = [{"type": "audio", "data": "..."}]
    agent = _bare_agent()

    agent._sync_external_memory_for_turn(
        original_user_message=audio_only_message,
        final_response="noted",
        interrupted=False,
    )

    for untouched in (
        agent._memory_manager.sync_all,
        agent._memory_manager.queue_prefetch_all,
    ):
        untouched.assert_not_called()
|
||||
|
||||
# --- The specific matrix the reporter asked about ------------------
|
||||
|
||||
@pytest.mark.parametrize("interrupted,final,user,expect_sync", [
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue