fix(moa): advisory references end on a user turn + get a reference-role system prompt (#54007)

* fix(moa): reference advisory view must end with a user turn

  MoA reference calls failed with Anthropic models that don't support assistant prefill (e.g. Claude Opus 4.8): '400 ... must end with a user message'. The advisory view built by _reference_messages() kept the last assistant turn's text while dropping the following tool result, leaving a trailing assistant turn — which Anthropic (and OpenRouter->Anthropic) interpret as an assistant prefill to continue.

  References are advisory and must end on the user turn they answer. Strip trailing assistant turns from the advisory view (preserving intervening ones). Update the existing test that encoded the buggy shape and add a mid-tool-loop regression test.

* feat(moa): give reference models an advisory-role system prompt

  Reference models received the bare trimmed conversation with no role framing, so they assumed they were the acting agent and refused ("I can't access repositories/URLs from here") or tried to call tools they don't have.

  Prepend a dedicated advisory system prompt to every reference call: the model is an analyst, not the actor — it cannot execute, should not apologize for lacking tools, and should reason about the presented state to advise the aggregator/orchestrator on approach, next steps, tool-use strategy, risks, and anything the acting agent missed. Its output is private guidance for the aggregator, not a user-facing answer.
2026-07-01 12:02:05 +00:00 · 2026-06-27 22:52:25 -07:00 · 2026-06-27 22:52:25 -07:00 · 1fa44180b0
commit 1fa44180b0
parent 2523917680
2 changed files with 129 additions and 6 deletions
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -26,6 +26,34 @@ logger = logging.getLogger(__name__)
 # opening dozens of sockets at once.
 _MAX_REFERENCE_WORKERS = 8

+# System prompt prepended to every reference-model call. References are
+# advisory — they do NOT act, call tools, or own the task. Without this
+# framing a reference receives the bare trimmed conversation and assumes it is
+# the acting agent: it then refuses ("I can't access repositories / URLs from
+# here") or tries to call tools it doesn't have. The prompt reframes the model
+# as an analyst whose job is to reason about the presented state and hand its
+# best thinking to the aggregator/orchestrator that will actually act.
+_REFERENCE_SYSTEM_PROMPT = (
+    "You are a reference advisor in a Mixture of Agents (MoA) process. You are "
+    "NOT the acting agent and you do NOT execute anything: you cannot call "
+    "tools, run commands, browse, or access files, repositories, or URLs, and "
+    "you should not try to or apologize for being unable to. A separate "
+    "aggregator/orchestrator model holds those capabilities and will take the "
+    "actual actions.\n\n"
+    "The conversation below is the current state of a task handled by that "
+    "acting agent. Your job is to give your most intelligent analysis of that "
+    "state: understand the goal, reason about the problem, and advise on what "
+    "to do next. Surface the best approach, concrete next steps and tool-use "
+    "strategy, likely pitfalls and risks, and anything the acting agent may "
+    "have missed or gotten wrong. Assume any referenced files, URLs, or "
+    "systems exist and reason about them from the context given rather than "
+    "asking for access.\n\n"
+    "Respond with your advice directly — no preamble, no disclaimers about "
+    "tools or access. Your response is private guidance handed to the "
+    "aggregator, not an answer shown to the user."
+)
+
+

 def _slot_label(slot: dict[str, str]) -> str:
    return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}"
@@ -100,9 +128,14 @@ def _run_reference(
    """
    label = _slot_label(slot)
    try:
+        # Prepend the advisory-role system prompt so the reference understands
+        # it is analyzing state for an aggregator, not acting on the task. The
+        # trimmed view (_reference_messages) already strips the agent's own
+        # system prompt, so this is the only system message the reference sees.
+        messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
        response = call_llm(
            task="moa_reference",
-            messages=ref_messages,
+            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            **_slot_runtime(slot),
@@ -169,6 +202,17 @@ def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    that reject orphan ``tool`` messages or ``tool_calls`` the reference never
    produced. We keep only the user/assistant *text* turns, dropping the
    system prompt, any ``tool``-role messages, and any ``tool_calls`` payloads.
+
+    The trimmed view MUST end with a ``user`` turn. An advisory reference call
+    answers the latest user input; it must never end with an ``assistant``
+    turn, which Anthropic (and OpenRouter→Anthropic) interpret as an assistant
+    *prefill* the model should continue — some models (e.g. Claude Opus 4.8)
+    reject prefill outright with ``400 ... must end with a user message``. This
+    is the common mid-tool-loop case: the last assistant turn carried
+    interleaved reasoning text plus tool calls, so its text survives the trim
+    while the following ``tool`` result is dropped, leaving a trailing
+    assistant turn. We strip any trailing assistant turns so the reference sees
+    a user message last.
    """
    trimmed: list[dict[str, Any]] = []
    for msg in messages:
@@ -186,9 +230,14 @@ def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
            # Assistant turn that was purely tool calls — nothing advisory.
            continue
        trimmed.append({"role": role, "content": text})
+    # Advisory calls must end on a user turn (no assistant prefill). Drop any
+    # trailing assistant turns left by the tool-loop trim above.
+    while trimmed and trimmed[-1].get("role") == "assistant":
+        trimmed.pop()
    if not trimmed:
-        # Degenerate case (e.g. first turn was stripped): fall back to a
-        # minimal user turn so the reference still has something to answer.
+        # Degenerate case (e.g. first turn was stripped, or the view trimmed
+        # down to assistant-only): fall back to the latest user turn so the
+        # reference still has something to answer.
        for msg in reversed(messages):
            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                return [{"role": "user", "content": msg["content"]}]
--- a/tests/run_agent/test_moa_loop_mode.py
+++ b/tests/run_agent/test_moa_loop_mode.py
@@ -236,12 +236,81 @@ def test_reference_messages_strips_system_and_tool_history():
    # System prompt, tool-call-only assistant turn, and tool result are gone.
    assert all(m["role"] in ("user", "assistant") for m in trimmed)
    assert all("tool_calls" not in m for m in trimmed)
+    # The advisory view must end on a user turn — a trailing assistant turn is
+    # treated by Anthropic as an assistant prefill (400 on no-prefill models).
+    # The only kept user turn here is the prompt, so the trailing assistant
+    # answer is stripped.
    assert trimmed == [
        {"role": "user", "content": "do the thing"},
-        {"role": "assistant", "content": "here is my answer"},
    ]


+def test_reference_messages_ends_with_user_not_assistant_prefill():
+    """Advisory reference views must never end on an assistant turn.
+
+    Mid-tool-loop, the last assistant turn carries interleaved reasoning text
+    plus tool calls; its text survives the trim while the following tool result
+    is dropped, leaving a trailing assistant turn. Anthropic (and
+    OpenRouter→Anthropic) treat that as an assistant prefill the model should
+    continue, and no-prefill models (e.g. Claude Opus 4.8) reject it with
+    ``400 ... must end with a user message``. The trim must drop trailing
+    assistant turns while preserving intervening ones.
+    """
+    from agent.moa_loop import _reference_messages
+
+    messages = [
+        {"role": "user", "content": "q1"},
+        {"role": "assistant", "content": "a1"},
+        {"role": "user", "content": "q2 current"},
+        {
+            "role": "assistant",
+            "content": "let me reason then call a tool",
+            "tool_calls": [{"id": "c1", "function": {"name": "f", "arguments": "{}"}}],
+        },
+        {"role": "tool", "tool_call_id": "c1", "content": "tool result"},
+    ]
+
+    trimmed = _reference_messages(messages)
+
+    assert trimmed, "advisory view should not be empty"
+    assert trimmed[-1]["role"] == "user"
+    # Intervening assistant context is preserved; only the trailing one drops.
+    assert trimmed == [
+        {"role": "user", "content": "q1"},
+        {"role": "assistant", "content": "a1"},
+        {"role": "user", "content": "q2 current"},
+    ]
+
+
+def test_run_reference_prepends_advisory_system_prompt(monkeypatch):
+    """Each reference call gets the advisory-role system prompt first.
+
+    Without it the reference assumes it is the acting agent and refuses ("I
+    can't access repositories/URLs from here") or tries to call tools it
+    doesn't have. The system prompt reframes it as an analyst advising the
+    aggregator, and the advisory transcript still ends on a user turn.
+    """
+    from agent.moa_loop import _REFERENCE_SYSTEM_PROMPT, _run_reference
+
+    captured = {}
+
+    def fake_call_llm(**kwargs):
+        captured.update(kwargs)
+        return _response("advice")
+
+    monkeypatch.setattr("agent.moa_loop.call_llm", fake_call_llm)
+
+    label, text = _run_reference(
+        {"provider": "openai-codex", "model": "gpt-5.5"},
+        [{"role": "user", "content": "review this PR"}],
+    )
+
+    assert text == "advice"
+    msgs = captured["messages"]
+    assert msgs[0] == {"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}
+    assert msgs[-1]["role"] == "user"
+
+
 def test_moa_facade_references_get_trimmed_messages(monkeypatch, tmp_path):
    home = tmp_path / ".hermes"
    home.mkdir()
@@ -282,8 +351,13 @@ moa:
    )

    ref_call = next(c for c in calls if c["task"] == "moa_reference")
-    # Reference never sees system prompt or tool-role messages.
-    assert all(m["role"] == "user" for m in ref_call["messages"])
+    # Reference gets the advisory-role system prompt first, then user turns
+    # only — never the agent's own system prompt or tool-role messages.
+    ref_msgs = ref_call["messages"]
+    assert ref_msgs[0]["role"] == "system"
+    assert "reference advisor" in ref_msgs[0]["content"].lower()
+    assert "huge hermes system prompt" not in ref_msgs[0]["content"]
+    assert all(m["role"] == "user" for m in ref_msgs[1:])
    assert ref_call.get("tools") in (None, [])
    # Aggregator still receives the original messages + tool schema.
    agg_call = next(c for c in calls if c["task"] == "moa_aggregator")