fix(transport): strip Hermes-internal scaffolding keys before chat.completions

The empty-response recovery path in run_agent.py appends synthetic
messages tagged with _empty_recovery_synthetic (and the agent loop uses
_thinking_prefill / _empty_terminal_sentinel similarly). These are
internal bookkeeping markers — they must never reach the wire.

chat_completions' convert_messages only stripped Codex Responses leak
fields (codex_reasoning_items, call_id, etc.), not these _-prefixed
markers. Permissive providers (real OpenAI, Anthropic) silently ignore
unknown message keys so the bug stayed hidden, but strict
OpenAI-compatible gateways reject them outright. Observed against
codex.nekos.me:

    502: [ObjectParam] [input[617]._empty_recovery_synthetic] [unknown_parameter] Unknown parameter: '_empty_recovery_synthetic'

Because the synthetic messages persist in the session, every subsequent
request in that session carries the poisoned key and fails identically —
a deterministic 502 the retry loop mistakes for a transient server error.

Fix: convert_messages now drops any top-level message key starting with
'_'. OpenAI's message schema has no '_'-prefixed fields, so this is safe
and future-proofs against new internal markers.

Origin: local-author
Upstream-PR: none
Patch-State: local-only
2026-07-13 14:02:16 +00:00 · 2026-05-15 11:26:47 +09:00 · 2026-05-15 11:26:47 +09:00 · 775a17284f
commit 775a17284f
parent 7ab1677362
2 changed files with 52 additions and 3 deletions
--- a/agent/transports/chat_completions.py
+++ b/agent/transports/chat_completions.py
@ -113,9 +113,8 @@ class ChatCompletionsTransport(ProviderTransport):
        self, messages: list[dict[str, Any]], **kwargs
    ) -> list[dict[str, Any]]:
        """Messages are already in OpenAI format — strip internal fields
-        that strict chat-completions providers reject with HTTP 400/422.
-
-        Strips:
+        that strict chat-completions providers reject with HTTP 400/422
+        (or, in the case of some OpenAI-compatible gateways, 5xx):

        - Codex Responses API fields: ``codex_reasoning_items`` /
          ``codex_message_items`` on the message, ``call_id`` /
@ -127,6 +126,16 @@ class ChatCompletionsTransport(ProviderTransport):
          ``Extra inputs are not permitted, field: 'messages[N].tool_name'``.
          Permissive providers (OpenRouter, MiniMax) silently ignore the
          field, which masked the bug for months.
+        - Hermes-internal scaffolding markers — any top-level message key
+          starting with ``_`` (e.g. ``_empty_recovery_synthetic``,
+          ``_empty_terminal_sentinel``, ``_thinking_prefill``). These are
+          bookkeeping flags the agent loop attaches to messages so the
+          persistence layer can later strip its own scaffolding; they must
+          never reach the wire. Permissive providers (real OpenAI,
+          Anthropic) silently ignore unknown message keys, but strict
+          gateways (e.g. opencode-go, codex.nekos.me) reject the request
+          (observed: ``502 ... Unknown parameter: '_empty_recovery_synthetic'``);
+          the marker persists in the session, so every later request fails too.
        """
        needs_sanitize = False
        for msg in messages:
@ -139,6 +148,9 @@ class ChatCompletionsTransport(ProviderTransport):
            ):
                needs_sanitize = True
                break
+            if any(isinstance(k, str) and k.startswith("_") for k in msg):
+                needs_sanitize = True
+                break
            tool_calls = msg.get("tool_calls")
            if isinstance(tool_calls, list):
                for tc in tool_calls:
@ -160,6 +172,11 @@ class ChatCompletionsTransport(ProviderTransport):
            msg.pop("codex_reasoning_items", None)
            msg.pop("codex_message_items", None)
            msg.pop("tool_name", None)
+            # Drop all Hermes-internal scaffolding markers (``_``-prefixed).
+            # OpenAI's message schema has no ``_``-prefixed fields, so this
+            # is safe and future-proofs against new markers being added.
+            for key in [k for k in msg if isinstance(k, str) and k.startswith("_")]:
+                msg.pop(key, None)
            tool_calls = msg.get("tool_calls")
            if isinstance(tool_calls, list):
                for tc in tool_calls:
--- a/tests/agent/transports/test_chat_completions.py
+++ b/tests/agent/transports/test_chat_completions.py
@ -66,6 +66,38 @@ class TestChatCompletionsBasic:
        # Original list untouched (deepcopy-on-demand)
        assert msgs[2]["tool_name"] == "execute_code"

+    def test_convert_messages_strips_internal_scaffolding_markers(self, transport):
+        """Hermes-internal ``_``-prefixed markers must never reach the wire.
+
+        The empty-response recovery path appends synthetic messages tagged
+        with ``_empty_recovery_synthetic``; permissive providers ignore the
+        unknown key, but strict gateways (opencode-go, codex.nekos.me)
+        reject the request, poisoning every later turn in the session.
+        """
+        msgs = [
+            {"role": "user", "content": "run the task"},
+            {"role": "assistant", "content": "(empty)", "_empty_recovery_synthetic": True},
+            {"role": "user", "content": "continue", "_empty_recovery_synthetic": True},
+            {"role": "assistant", "content": "done", "_thinking_prefill": True,
+             "_empty_terminal_sentinel": True},
+        ]
+        result = transport.convert_messages(msgs)
+        for m in result:
+            assert not any(k.startswith("_") for k in m), m
+        # Visible content preserved
+        assert result[1]["content"] == "(empty)"
+        assert result[2]["content"] == "continue"
+        # Original list untouched (deepcopy-on-demand)
+        assert msgs[1]["_empty_recovery_synthetic"] is True
+
+    def test_convert_messages_clean_list_is_identity(self, transport):
+        """A list with no internal/codex keys is returned as-is (no copy)."""
+        msgs = [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "hello"},
+        ]
+        assert transport.convert_messages(msgs) is msgs
+

 class TestChatCompletionsBuildKwargs: