diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index ce2ff16cb0a..8503a7f90f9 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -828,7 +828,6 @@ def run_conversation( aggregator=moa_config.get("aggregator") or {}, temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6), aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4), - max_tokens=int(moa_config.get("max_tokens", 4096) or 4096), ) if _moa_context: for _msg in reversed(api_messages): diff --git a/agent/moa_loop.py b/agent/moa_loop.py index f908c70a0a5..34a4c551dca 100644 --- a/agent/moa_loop.py +++ b/agent/moa_loop.py @@ -30,15 +30,61 @@ def _slot_label(slot: dict[str, str]) -> str: return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}" +def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]: + """Resolve a reference/aggregator slot to real runtime call kwargs. + + A MoA slot is just a model selection — it must be called the same way any + model is called elsewhere, not through a bare ``call_llm(provider=..., + model=...)`` that leaves base_url/api_key/api_mode unresolved and lets the + auxiliary auto-detector guess. We route the slot's provider through + ``resolve_runtime_provider`` (the canonical provider→api_mode/base_url/ + api_key resolver the CLI, gateway, and delegate_task all use), so the slot + gets its provider's real API surface — e.g. MiniMax → anthropic_messages, + GPT-5/o-series → max_completion_tokens, custom endpoints → their base_url. + + Returns the kwargs to pass through to ``call_llm`` (provider/model plus the + resolved base_url/api_key when available). Falls back to the bare + provider/model on any resolution error so a misconfigured slot still + attempts the call rather than aborting the whole MoA turn. + """ + provider = str(slot.get("provider") or "").strip() + model = str(slot.get("model") or "").strip() + out: dict[str, Any] = {"provider": provider, "model": model} + try: + from hermes_cli.runtime_provider import resolve_runtime_provider + + rt = resolve_runtime_provider(requested=provider, target_model=model) + # Pass the resolved endpoint through so call_llm builds the request for + # the provider's actual API surface instead of auto-detecting. base_url + # routes call_llm to the right adapter (incl. anthropic_messages mode); + # api_key is the resolved credential for that provider. + if rt.get("base_url"): + out["base_url"] = rt["base_url"] + if rt.get("api_key"): + out["api_key"] = rt["api_key"] + except Exception as exc: # pragma: no cover - defensive + logger.debug("MoA slot runtime resolution failed for %s: %s", _slot_label(slot), exc) + return out + + def _run_reference( slot: dict[str, str], ref_messages: list[dict[str, Any]], *, - temperature: float, - max_tokens: int, + temperature: float | None = None, + max_tokens: int | None = None, ) -> tuple[str, str]: """Call one reference model and return ``(label, text)``. + The slot is resolved to its provider's real runtime (via ``_slot_runtime``) + and called through the same ``call_llm`` request-building path any model + uses, so per-model wire-format handling (anthropic_messages, + max_completion_tokens, fixed/forbidden temperature) applies identically to + a reference as it would if that model were the acting model. MoA imposes no + cap of its own (``max_tokens`` defaults to ``None`` → omitted → the model's + real maximum); ``temperature`` is only the user's configured preset value, + which call_llm may still override per model. + Never raises: a failed reference becomes a labelled note so the aggregator can still act with partial context. Designed to run inside a thread pool — ``call_llm`` is synchronous/blocking, so threads (not asyncio) are the right @@ -48,11 +94,10 @@ def _run_reference( try: response = call_llm( task="moa_reference", - provider=slot["provider"], - model=slot["model"], messages=ref_messages, temperature=temperature, max_tokens=max_tokens, + **_slot_runtime(slot), ) return label, _extract_text(response) or "(empty response)" except Exception as exc: @@ -64,8 +109,8 @@ def _run_references_parallel( reference_models: list[dict[str, str]], ref_messages: list[dict[str, Any]], *, - temperature: float, - max_tokens: int, + temperature: float | None = None, + max_tokens: int | None = None, ) -> list[tuple[str, str]]: """Fan out all reference models in parallel, returning outputs in order. @@ -169,12 +214,18 @@ def aggregate_moa_context( aggregator: dict[str, str], temperature: float = 0.6, aggregator_temperature: float = 0.4, - max_tokens: int = 4096, + max_tokens: int | None = None, ) -> str: """Run configured reference models and synthesize their advice. Failures are returned as model-specific notes instead of aborting the normal agent loop; the main model can still act with partial context. + + ``max_tokens`` is ``None`` by default: MoA does not cap reference or + aggregator output, so each model uses its own maximum. ``call_llm`` omits + the parameter entirely when it is ``None`` (see its docstring), which also + sidesteps providers that reject ``max_tokens`` outright. A hardcoded cap + here previously truncated long aggregator syntheses. """ reference_outputs: list[tuple[str, str]] = [] ref_messages = _reference_messages(api_messages) @@ -203,11 +254,10 @@ def aggregate_moa_context( try: response = call_llm( task="moa_aggregator", - provider=aggregator["provider"], - model=aggregator["model"], messages=[{"role": "user", "content": synth_prompt}], temperature=aggregator_temperature, max_tokens=max_tokens, + **_slot_runtime(aggregator), ) synthesis = _extract_text(response) except Exception as exc: @@ -241,7 +291,10 @@ class MoAChatCompletions: messages = list(api_kwargs.get("messages") or []) reference_models = preset.get("reference_models") or [] aggregator = preset.get("aggregator") or {} - max_tokens = int(preset.get("max_tokens", api_kwargs.get("max_tokens") or 4096) or 4096) + # MoA does not cap reference or aggregator output: each model uses its + # own maximum. Passing max_tokens=None makes call_llm omit the parameter + # (it never caps by default), so a long aggregator synthesis is never + # truncated and providers that reject max_tokens don't 400. temperature = float(preset.get("reference_temperature", 0.6) or 0.6) aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4) @@ -257,7 +310,7 @@ class MoAChatCompletions: reference_models, ref_messages, temperature=temperature, - max_tokens=max_tokens, + max_tokens=None, ) agg_messages = [dict(m) for m in messages] @@ -286,17 +339,22 @@ class MoAChatCompletions: raise RuntimeError("MoA aggregator cannot be another MoA preset") agg_kwargs = dict(api_kwargs) agg_kwargs["messages"] = agg_messages - agg_kwargs["model"] = aggregator.get("model") - agg_kwargs["temperature"] = aggregator_temperature + # The aggregator is the acting model. Resolve its slot to the provider's + # real runtime (base_url/api_key/api_mode) and call it through the same + # request-building path any model uses — so per-model wire-format + # handling (anthropic_messages, max_completion_tokens, fixed/forbidden + # temperature) applies identically to it. MoA imposes no output cap: + # max_tokens is passed through from the caller (normally None → omitted + # → the model's real maximum). The preset's old hardcoded 4096 default + # is gone — it truncated long syntheses. return call_llm( task="moa_aggregator", - provider=aggregator.get("provider"), - model=aggregator.get("model"), messages=agg_messages, temperature=aggregator_temperature, max_tokens=agg_kwargs.get("max_tokens"), tools=agg_kwargs.get("tools"), extra_body=agg_kwargs.get("extra_body"), + **_slot_runtime(aggregator), ) diff --git a/tests/run_agent/test_moa_loop_mode.py b/tests/run_agent/test_moa_loop_mode.py index 766e12347b0..ba8f86f03a4 100644 --- a/tests/run_agent/test_moa_loop_mode.py +++ b/tests/run_agent/test_moa_loop_mode.py @@ -61,6 +61,117 @@ moa: assert calls[1]["tools"] is not None +def test_moa_does_not_cap_output_tokens(monkeypatch, tmp_path): + """MoA must not inject an output cap on reference or aggregator calls. + + The preset's old hardcoded max_tokens=4096 truncated long aggregator + syntheses. MoA now passes max_tokens=None (no caller cap), so call_llm + omits the parameter and each model uses its real maximum. Regression for + the "no limit on MoA models" fix. + """ + home = tmp_path / ".hermes" + home.mkdir() + (home / "config.yaml").write_text( + """ +moa: + default_preset: review + presets: + review: + max_tokens: 4096 + reference_models: + - provider: openai-codex + model: gpt-5.5 + aggregator: + provider: openrouter + model: anthropic/claude-opus-4.8 +""".strip(), + encoding="utf-8", + ) + monkeypatch.setenv("HERMES_HOME", str(home)) + calls = [] + + def fake_call_llm(**kwargs): + calls.append(kwargs) + if kwargs["task"] == "moa_reference": + return _response("reference advice") + return _response("aggregator acted") + + monkeypatch.setattr("agent.moa_loop.call_llm", fake_call_llm) + + agent = AIAgent( + api_key="moa-virtual-provider", + base_url="moa://local", + model="review", + provider="moa", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + enabled_toolsets=["file"], + max_iterations=1, + ) + agent.run_conversation("solve this") + + # Even with a preset max_tokens: 4096 present in config, neither the + # reference nor the aggregator call carries a cap — MoA passes None and + # call_llm omits the parameter so the model uses its full output budget. + ref_call = next(c for c in calls if c["task"] == "moa_reference") + agg_call = next(c for c in calls if c["task"] == "moa_aggregator") + assert ref_call.get("max_tokens") is None + assert agg_call.get("max_tokens") is None + + +def test_moa_slots_routed_through_resolve_runtime_provider(monkeypatch): + """Reference + aggregator slots must be called via their provider's real + runtime (resolve_runtime_provider), not a bare provider/model call. + + This is the "call any model the way it's called elsewhere" contract: each + slot's resolved base_url/api_key is passed through to call_llm so the + provider's actual API surface (anthropic_messages, max_completion_tokens, + custom endpoints) applies — same as if the model were the acting model. + """ + from agent import moa_loop + + resolved = [] + + def fake_resolve(*, requested, target_model=None): + resolved.append((requested, target_model)) + return { + "provider": requested, + "api_mode": "chat_completions", + "base_url": f"https://{requested}.example/v1", + "api_key": f"key-for-{requested}", + } + + monkeypatch.setattr( + "hermes_cli.runtime_provider.resolve_runtime_provider", fake_resolve + ) + + rt = moa_loop._slot_runtime({"provider": "minimax", "model": "MiniMax-M2"}) + assert ("minimax", "MiniMax-M2") in resolved + assert rt["provider"] == "minimax" + assert rt["model"] == "MiniMax-M2" + assert rt["base_url"] == "https://minimax.example/v1" + assert rt["api_key"] == "key-for-minimax" + + +def test_moa_slot_runtime_falls_back_on_resolution_error(monkeypatch): + """A slot whose provider can't be resolved still attempts the call with the + bare provider/model rather than aborting the whole MoA turn.""" + from agent import moa_loop + + def boom(*, requested, target_model=None): + raise RuntimeError("unknown provider") + + monkeypatch.setattr( + "hermes_cli.runtime_provider.resolve_runtime_provider", boom + ) + + rt = moa_loop._slot_runtime({"provider": "mystery", "model": "x"}) + assert rt == {"provider": "mystery", "model": "x"} + assert "base_url" not in rt + assert "api_key" not in rt + + def test_reference_messages_strips_system_and_tool_history(): from agent.moa_loop import _reference_messages