# Review notes — preamble placed ahead of the first "diff --git" header; not
# part of any hunk.
#
# What this patch does:
#   * agent/moa_loop.py: adds _REFERENCE_SYSTEM_PROMPT and has _run_reference
#     prepend it as a system message to the messages passed to call_llm.
#   * agent/moa_loop.py: _reference_messages now pops trailing assistant turns
#     ahead of the existing empty-view fallback.
#   * tests/run_agent/test_moa_loop_mode.py: updates the expected trimmed view
#     in the existing strip test, adds two tests, and changes the facade test
#     to expect a system message first and only user turns after it.
#
# NOTE(review): line breaks and indentation were restored from a
# whitespace-collapsed copy. Context-line indentation and blank-line placement
# are inferred — confirm against the base files before applying.
# NOTE(review): tool-role messages are dropped and every trailing assistant
# turn is popped, so mid-tool-loop the reference view appears to stop at the
# latest user turn — confirm that omitting in-progress reasoning is intended.
diff --git a/agent/moa_loop.py b/agent/moa_loop.py
index ed566325eeb..5fe984e7fd2 100644
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -26,6 +26,34 @@ logger = logging.getLogger(__name__)
 # opening dozens of sockets at once.
 _MAX_REFERENCE_WORKERS = 8
 
+# System prompt prepended to every reference-model call. References are
+# advisory — they do NOT act, call tools, or own the task. Without this
+# framing a reference receives the bare trimmed conversation and assumes it is
+# the acting agent: it then refuses ("I can't access repositories / URLs from
+# here") or tries to call tools it doesn't have. The prompt reframes the model
+# as an analyst whose job is to reason about the presented state and hand its
+# best thinking to the aggregator/orchestrator that will actually act.
+_REFERENCE_SYSTEM_PROMPT = (
+    "You are a reference advisor in a Mixture of Agents (MoA) process. You are "
+    "NOT the acting agent and you do NOT execute anything: you cannot call "
+    "tools, run commands, browse, or access files, repositories, or URLs, and "
+    "you should not try to or apologize for being unable to. A separate "
+    "aggregator/orchestrator model holds those capabilities and will take the "
+    "actual actions.\n\n"
+    "The conversation below is the current state of a task handled by that "
+    "acting agent. Your job is to give your most intelligent analysis of that "
+    "state: understand the goal, reason about the problem, and advise on what "
+    "to do next. Surface the best approach, concrete next steps and tool-use "
+    "strategy, likely pitfalls and risks, and anything the acting agent may "
+    "have missed or gotten wrong. Assume any referenced files, URLs, or "
+    "systems exist and reason about them from the context given rather than "
+    "asking for access.\n\n"
+    "Respond with your advice directly — no preamble, no disclaimers about "
+    "tools or access. Your response is private guidance handed to the "
+    "aggregator, not an answer shown to the user."
+)
+
+
 def _slot_label(slot: dict[str, str]) -> str:
     return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}"
 
@@ -100,9 +128,14 @@ def _run_reference(
     """
     label = _slot_label(slot)
     try:
+        # Prepend the advisory-role system prompt so the reference understands
+        # it is analyzing state for an aggregator, not acting on the task. The
+        # trimmed view (_reference_messages) already strips the agent's own
+        # system prompt, so this is the only system message the reference sees.
+        messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
         response = call_llm(
             task="moa_reference",
-            messages=ref_messages,
+            messages=messages,
             temperature=temperature,
             max_tokens=max_tokens,
             **_slot_runtime(slot),
@@ -169,6 +202,17 @@ def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
     that reject orphan ``tool`` messages or ``tool_calls`` the reference never
     produced. We keep only the user/assistant *text* turns, dropping the system
     prompt, any ``tool``-role messages, and any ``tool_calls`` payloads.
+
+    The trimmed view MUST end with a ``user`` turn. An advisory reference call
+    answers the latest user input; it must never end with an ``assistant``
+    turn, which Anthropic (and OpenRouter→Anthropic) interpret as an assistant
+    *prefill* the model should continue — some models (e.g. Claude Opus 4.8)
+    reject prefill outright with ``400 ... must end with a user message``. This
+    is the common mid-tool-loop case: the last assistant turn carried
+    interleaved reasoning text plus tool calls, so its text survives the trim
+    while the following ``tool`` result is dropped, leaving a trailing
+    assistant turn. We strip any trailing assistant turns so the reference sees
+    a user message last.
     """
     trimmed: list[dict[str, Any]] = []
     for msg in messages:
@@ -186,9 +230,14 @@ def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
                 # Assistant turn that was purely tool calls — nothing advisory.
                 continue
         trimmed.append({"role": role, "content": text})
+    # Advisory calls must end on a user turn (no assistant prefill). Drop any
+    # trailing assistant turns left by the tool-loop trim above.
+    while trimmed and trimmed[-1].get("role") == "assistant":
+        trimmed.pop()
     if not trimmed:
-        # Degenerate case (e.g. first turn was stripped): fall back to a
-        # minimal user turn so the reference still has something to answer.
+        # Degenerate case (e.g. first turn was stripped, or the view trimmed
+        # down to assistant-only): fall back to the latest user turn so the
+        # reference still has something to answer.
         for msg in reversed(messages):
             if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                 return [{"role": "user", "content": msg["content"]}]
diff --git a/tests/run_agent/test_moa_loop_mode.py b/tests/run_agent/test_moa_loop_mode.py
index c05dd3b267f..b07a8156281 100644
--- a/tests/run_agent/test_moa_loop_mode.py
+++ b/tests/run_agent/test_moa_loop_mode.py
@@ -236,12 +236,81 @@ def test_reference_messages_strips_system_and_tool_history():
     # System prompt, tool-call-only assistant turn, and tool result are gone.
     assert all(m["role"] in ("user", "assistant") for m in trimmed)
     assert all("tool_calls" not in m for m in trimmed)
+    # The advisory view must end on a user turn — a trailing assistant turn is
+    # treated by Anthropic as an assistant prefill (400 on no-prefill models).
+    # The only kept user turn here is the prompt, so the trailing assistant
+    # answer is stripped.
     assert trimmed == [
         {"role": "user", "content": "do the thing"},
-        {"role": "assistant", "content": "here is my answer"},
     ]
 
 
+def test_reference_messages_ends_with_user_not_assistant_prefill():
+    """Advisory reference views must never end on an assistant turn.
+
+    Mid-tool-loop, the last assistant turn carries interleaved reasoning text
+    plus tool calls; its text survives the trim while the following tool result
+    is dropped, leaving a trailing assistant turn. Anthropic (and
+    OpenRouter→Anthropic) treat that as an assistant prefill the model should
+    continue, and no-prefill models (e.g. Claude Opus 4.8) reject it with
+    ``400 ... must end with a user message``. The trim must drop trailing
+    assistant turns while preserving intervening ones.
+    """
+    from agent.moa_loop import _reference_messages
+
+    messages = [
+        {"role": "user", "content": "q1"},
+        {"role": "assistant", "content": "a1"},
+        {"role": "user", "content": "q2 current"},
+        {
+            "role": "assistant",
+            "content": "let me reason then call a tool",
+            "tool_calls": [{"id": "c1", "function": {"name": "f", "arguments": "{}"}}],
+        },
+        {"role": "tool", "tool_call_id": "c1", "content": "tool result"},
+    ]
+
+    trimmed = _reference_messages(messages)
+
+    assert trimmed, "advisory view should not be empty"
+    assert trimmed[-1]["role"] == "user"
+    # Intervening assistant context is preserved; only the trailing one drops.
+    assert trimmed == [
+        {"role": "user", "content": "q1"},
+        {"role": "assistant", "content": "a1"},
+        {"role": "user", "content": "q2 current"},
+    ]
+
+
+def test_run_reference_prepends_advisory_system_prompt(monkeypatch):
+    """Each reference call gets the advisory-role system prompt first.
+
+    Without it the reference assumes it is the acting agent and refuses ("I
+    can't access repositories/URLs from here") or tries to call tools it
+    doesn't have. The system prompt reframes it as an analyst advising the
+    aggregator, and the advisory transcript still ends on a user turn.
+    """
+    from agent.moa_loop import _REFERENCE_SYSTEM_PROMPT, _run_reference
+
+    captured = {}
+
+    def fake_call_llm(**kwargs):
+        captured.update(kwargs)
+        return _response("advice")
+
+    monkeypatch.setattr("agent.moa_loop.call_llm", fake_call_llm)
+
+    label, text = _run_reference(
+        {"provider": "openai-codex", "model": "gpt-5.5"},
+        [{"role": "user", "content": "review this PR"}],
+    )
+
+    assert text == "advice"
+    msgs = captured["messages"]
+    assert msgs[0] == {"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}
+    assert msgs[-1]["role"] == "user"
+
+
 def test_moa_facade_references_get_trimmed_messages(monkeypatch, tmp_path):
     home = tmp_path / ".hermes"
     home.mkdir()
@@ -282,8 +351,13 @@ moa:
     )
 
     ref_call = next(c for c in calls if c["task"] == "moa_reference")
-    # Reference never sees system prompt or tool-role messages.
-    assert all(m["role"] == "user" for m in ref_call["messages"])
+    # Reference gets the advisory-role system prompt first, then user turns
+    # only — never the agent's own system prompt or tool-role messages.
+    ref_msgs = ref_call["messages"]
+    assert ref_msgs[0]["role"] == "system"
+    assert "reference advisor" in ref_msgs[0]["content"].lower()
+    assert "huge hermes system prompt" not in ref_msgs[0]["content"]
+    assert all(m["role"] == "user" for m in ref_msgs[1:])
     assert ref_call.get("tools") in (None, [])
     # Aggregator still receives the original messages + tool schema.
     agg_call = next(c for c in calls if c["task"] == "moa_aggregator")