mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
* feat(moa): expose MoA presets as selectable virtual models

  Reconstructed onto current main (PR #46081's base had diverged with no common ancestor, marking the PR dirty so CI never dispatched).

  MoA is now a virtual provider: each named preset is a selectable model under provider 'moa', and the preset's aggregator is the acting model that answers and calls tools. Reference models fan out in parallel via a bounded ThreadPoolExecutor (the same batch pattern delegate_task uses) — all references are dispatched at once, collected when every one finishes, then handed to the aggregator. Output order is preserved, and failures and the MoA-recursion guard stay isolated per reference.

  - Removed the old mixture_of_agents model tool and moa toolset.
  - Added moa as a virtual provider in the provider/model inventory.
  - /moa is shortcut behavior over model selection (default preset / named preset / one-shot prompt).
  - Dashboard + Desktop manage named presets; presets appear in model pickers.
  - Parallel reference fan-out in agent/moa_loop.py with regression test.

* fix(moa): thread moa_config through _run_agent to _run_agent_inner

  The reconstructed gateway MoA wiring declared moa_config on _run_agent (the profile-scoping wrapper) and used it inside _run_agent_inner, but the wrapper never forwarded it — _run_agent_inner had no such parameter, so the runtime hit "NameError: name 'moa_config' is not defined" on the compression-failure session sync path.

  Add moa_config to _run_agent_inner's signature and forward it from both wrapper call sites (multiplex and non-multiplex).

  Caught by tests/gateway/test_compression_failure_session_sync.py on CI shard test(4).
* fix(moa): classify moa as a virtual provider in the catalog

  The moa virtual provider has no PROVIDER_REGISTRY/ProviderProfile entry, so provider_catalog() fell through to the default auth_type="api_key" with no env vars — tripping two catalog invariants:

  - test_provider_catalog: api_key providers must expose a credential env var
  - test_provider_parity: every hermes-model provider must be desktop-configurable

  moa already declares auth_type="virtual" in HERMES_OVERLAYS; consult that overlay as an auth_type fallback so the catalog reports moa as virtual (no real credential, no network endpoint).

  Exempt virtual providers from the desktop parity union check the same way 'custom' is exempt — derived from the catalog, not a hardcoded slug, so future virtual providers are covered too.
306 lines
12 KiB
Python
306 lines
12 KiB
Python
"""Mixture-of-Agents runtime helpers for /moa turns.
|
|
|
|
The slash command is deliberately not a model tool. It marks one user turn as
|
|
MoA-enabled; the normal Hermes agent loop still owns tool calling and turn
|
|
termination, while this module gathers reference-model context before each model
|
|
iteration.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from typing import Any
|
|
|
|
from agent.auxiliary_client import call_llm
|
|
from agent.transports import get_transport
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Upper bound on concurrent reference-model calls. References are independent
|
|
# advisory calls (no tools, no inter-dependence), so we fan them out the same
|
|
# way delegate_task runs a batch: all in flight at once, results collected when
|
|
# every reference finishes. Presets rarely list more than a handful of
|
|
# references; this cap just protects against a pathologically large preset
|
|
# opening dozens of sockets at once.
|
|
_MAX_REFERENCE_WORKERS = 8
|
|
|
|
|
|
def _slot_label(slot: dict[str, str]) -> str:
|
|
return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}"
|
|
|
|
|
|
def _run_reference(
    slot: dict[str, str],
    ref_messages: list[dict[str, Any]],
    *,
    temperature: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Query a single reference model and return ``(label, text)``.

    Any exception raised while calling the model is logged and converted into
    a ``[failed: ...]`` note so the aggregator can still work with partial
    context. ``call_llm`` is used as a plain blocking call here, which is why
    this function is meant to be run from a worker thread (the same batch
    fan-out approach ``delegate_task`` uses) rather than from asyncio.

    Args:
        slot: Mapping with the ``provider`` and ``model`` to call.
        ref_messages: Conversation view sent to the reference model.
        temperature: Sampling temperature for the reference call.
        max_tokens: Completion token limit for the reference call.

    Returns:
        The slot's ``provider:model`` label paired with the model's text,
        ``"(empty response)"`` when no text came back, or a failure note.
    """
    label = _slot_label(slot)
    try:
        reply = call_llm(
            task="moa_reference",
            provider=slot["provider"],
            model=slot["model"],
            messages=ref_messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        outcome = _extract_text(reply) or "(empty response)"
    except Exception as error:
        logger.warning("MoA reference model %s failed: %s", label, error)
        outcome = f"[failed: {error}]"
    return label, outcome
|
|
|
|
|
|
def _run_references_parallel(
|
|
reference_models: list[dict[str, str]],
|
|
ref_messages: list[dict[str, Any]],
|
|
*,
|
|
temperature: float,
|
|
max_tokens: int,
|
|
) -> list[tuple[str, str]]:
|
|
"""Fan out all reference models in parallel, returning outputs in order.
|
|
|
|
Like ``delegate_task``'s batch mode, every reference is dispatched at once
|
|
and we block until all of them finish before handing the joined results to
|
|
the aggregator. Output order matches ``reference_models`` so the
|
|
``Reference {idx}`` labelling stays stable. MoA presets that reference
|
|
another MoA preset are skipped here (recursion guard) with a labelled note.
|
|
"""
|
|
if not reference_models:
|
|
return []
|
|
|
|
results: list[tuple[str, str] | None] = [None] * len(reference_models)
|
|
futures = {}
|
|
workers = min(_MAX_REFERENCE_WORKERS, len(reference_models))
|
|
with ThreadPoolExecutor(max_workers=workers) as executor:
|
|
for idx, slot in enumerate(reference_models):
|
|
if slot.get("provider") == "moa":
|
|
results[idx] = (
|
|
_slot_label(slot),
|
|
"[skipped: MoA presets cannot recursively reference MoA]",
|
|
)
|
|
continue
|
|
futures[
|
|
executor.submit(
|
|
_run_reference,
|
|
slot,
|
|
ref_messages,
|
|
temperature=temperature,
|
|
max_tokens=max_tokens,
|
|
)
|
|
] = idx
|
|
# Collect every reference before returning — the aggregator needs the
|
|
# complete set, so there is no early-exit / first-completed path here.
|
|
for future, idx in futures.items():
|
|
results[idx] = future.result()
|
|
|
|
return [r for r in results if r is not None]
|
|
|
|
|
|
def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
"""Build an advisory-safe view of the conversation for reference models.
|
|
|
|
Reference calls are advisory: they never call tools and never emit the
|
|
``tool_calls`` the main model did. Replaying the full transcript verbatim
|
|
(a) re-bills the ~8K-token Hermes system prompt per reference per
|
|
iteration and (b) risks 400s from strict providers (Mistral, Fireworks)
|
|
that reject orphan ``tool`` messages or ``tool_calls`` the reference never
|
|
produced. We keep only the user/assistant *text* turns, dropping the
|
|
system prompt, any ``tool``-role messages, and any ``tool_calls`` payloads.
|
|
"""
|
|
trimmed: list[dict[str, Any]] = []
|
|
for msg in messages:
|
|
role = msg.get("role")
|
|
if role not in ("user", "assistant"):
|
|
# Drop system prompt and tool-result messages.
|
|
continue
|
|
content = msg.get("content")
|
|
if not isinstance(content, str):
|
|
# Skip non-text (multimodal/tool-call-only) assistant turns.
|
|
if not content:
|
|
continue
|
|
text = content if isinstance(content, str) else ""
|
|
if role == "assistant" and not text.strip():
|
|
# Assistant turn that was purely tool calls — nothing advisory.
|
|
continue
|
|
trimmed.append({"role": role, "content": text})
|
|
if not trimmed:
|
|
# Degenerate case (e.g. first turn was stripped): fall back to a
|
|
# minimal user turn so the reference still has something to answer.
|
|
for msg in reversed(messages):
|
|
if msg.get("role") == "user" and isinstance(msg.get("content"), str):
|
|
return [{"role": "user", "content": msg["content"]}]
|
|
return trimmed
|
|
|
|
|
|
|
|
def _extract_text(response: Any) -> str:
|
|
try:
|
|
transport = get_transport("chat_completions")
|
|
if transport is None:
|
|
raise RuntimeError("chat_completions transport unavailable")
|
|
normalized = transport.normalize_response(response)
|
|
text = (normalized.content or "").strip()
|
|
if text:
|
|
return text
|
|
except Exception:
|
|
pass
|
|
try:
|
|
content = response.choices[0].message.content
|
|
return (content or "").strip()
|
|
except Exception:
|
|
return ""
|
|
|
|
|
|
def aggregate_moa_context(
    *,
    user_prompt: str,
    api_messages: list[dict[str, Any]],
    reference_models: list[dict[str, str]],
    aggregator: dict[str, str],
    temperature: float = 0.6,
    aggregator_temperature: float = 0.4,
    max_tokens: int = 4096,
) -> str:
    """Collect reference-model advice and condense it into one guidance block.

    The reference models are queried in parallel on a text-only view of
    ``api_messages``; their answers are then handed to the aggregator model
    with a synthesis prompt. A failing reference shows up as a labelled note,
    and if the aggregator call fails or returns no text the raw joined
    reference answers are used instead, so this function does not abort the
    normal agent loop on model failures.

    Args:
        user_prompt: The original user request, quoted in the synthesis prompt.
        api_messages: Current conversation transcript.
        reference_models: ``provider``/``model`` slots to consult.
        aggregator: ``provider``/``model`` slot that synthesizes the advice.
        temperature: Sampling temperature for the reference calls.
        aggregator_temperature: Sampling temperature for the aggregator call.
        max_tokens: Token limit used for both reference and aggregator calls.

    Returns:
        A text block with a fixed header, the aggregator and reference labels,
        and the synthesized (or fallback) guidance.
    """
    reference_outputs = _run_references_parallel(
        reference_models,
        _reference_messages(api_messages),
        temperature=temperature,
        max_tokens=max_tokens,
    )

    sections = [
        f"Reference {number} — {label}:\n{text}"
        for number, (label, text) in enumerate(reference_outputs, start=1)
    ]
    joined = "\n\n".join(sections)

    instructions = (
        "You are the aggregator in a Mixture of Agents process. Synthesize the "
        "reference responses into concise, actionable guidance for the main "
        "Hermes agent. Focus on next steps, tool-use strategy, risks, and any "
        "disagreements. Do not answer the user directly unless that is all that "
        "is needed; produce context the main agent should use in its normal loop.\n\n"
    )
    synth_prompt = (
        f"{instructions}"
        f"Original user prompt:\n{user_prompt}\n\n"
        f"Reference responses:\n{joined}"
    )

    agg_label = _slot_label(aggregator)
    synthesis = ""
    try:
        reply = call_llm(
            task="moa_aggregator",
            provider=aggregator["provider"],
            model=aggregator["model"],
            messages=[{"role": "user", "content": synth_prompt}],
            temperature=aggregator_temperature,
            max_tokens=max_tokens,
        )
        synthesis = _extract_text(reply)
    except Exception as error:
        logger.warning("MoA aggregator model %s failed: %s", agg_label, error)

    # Without a usable synthesis, hand the raw reference answers to the agent.
    guidance_body = synthesis if synthesis else joined
    reference_labels = ", ".join(_slot_label(slot) for slot in reference_models)

    header = (
        "[Mixture of Agents context — use this as private guidance for the "
        "normal Hermes agent loop. You may call tools, continue reasoning, or "
        "finish normally.]\n"
    )
    return (
        f"{header}"
        f"Aggregator: {agg_label}\n"
        f"References: {reference_labels}\n\n"
        f"{guidance_body.strip()}"
    )
|
|
|
|
|
|
class MoAChatCompletions:
    """OpenAI-chat-compatible facade where the aggregator is the acting model.

    ``create`` fans the conversation out to the preset's reference models,
    attaches their answers to the latest user turn as private guidance, and
    then lets the preset's aggregator produce the actual response (including
    any tool calls).
    """

    def __init__(self, preset_name: str):
        # An empty preset name selects the preset called "default".
        self.preset_name = preset_name or "default"

    @staticmethod
    def _first_set(*candidates: Any) -> Any:
        """Return the first candidate that is neither ``None`` nor ``""``.

        Unlike an ``or`` chain this keeps legitimate falsy settings such as a
        temperature of ``0``. Returns ``None`` when nothing is set.
        """
        for candidate in candidates:
            if candidate is not None and candidate != "":
                return candidate
        return None

    @staticmethod
    def _attach_guidance(
        messages: list[dict[str, Any]], guidance: str
    ) -> list[dict[str, Any]]:
        """Return a copy of *messages* with *guidance* added to the latest user turn.

        String content gets the guidance appended after a blank line; list
        (multimodal) content gets an extra text part, so the guidance stays on
        the newest user turn instead of drifting to an older text-only one.
        User turns with any other content shape are skipped. When no suitable
        user turn exists, the guidance is appended as a new user message. The
        input list and its message dicts are not mutated.
        """
        augmented = [dict(message) for message in messages]
        for message in reversed(augmented):
            if message.get("role") != "user":
                continue
            content = message.get("content")
            if isinstance(content, str):
                message["content"] = content + "\n\n" + guidance
                return augmented
            if isinstance(content, list):
                # Build a new list so the caller's content parts stay untouched.
                message["content"] = [*content, {"type": "text", "text": guidance}]
                return augmented
        augmented.append({"role": "user", "content": guidance})
        return augmented

    def create(self, **api_kwargs: Any) -> Any:
        """Answer a chat-completions request through the configured MoA preset.

        Reference models are consulted in parallel using the preset's
        ``max_tokens`` and ``reference_temperature``; the aggregator call
        receives the caller's ``max_tokens``, ``tools`` and ``extra_body``.
        A disabled preset skips the references and calls the aggregator alone.

        Args:
            **api_kwargs: Chat-completions style arguments. ``messages``,
                ``max_tokens``, ``temperature``, ``tools`` and ``extra_body``
                are read; other keys are ignored.

        Returns:
            Whatever ``call_llm`` returns for the aggregator call.

        Raises:
            RuntimeError: If the preset's aggregator is itself a MoA preset.
        """
        from hermes_cli.config import load_config
        from hermes_cli.moa_config import resolve_moa_preset

        preset = resolve_moa_preset(load_config().get("moa") or {}, self.preset_name)
        aggregator = preset.get("aggregator") or {}

        # Reject a recursive aggregator before the fan-out so an invalid
        # preset does not pay for every reference call and then fail anyway.
        if aggregator.get("provider") == "moa":
            raise RuntimeError("MoA aggregator cannot be another MoA preset")

        messages = list(api_kwargs.get("messages") or [])
        reference_models = preset.get("reference_models") or []
        max_tokens = int(preset.get("max_tokens", api_kwargs.get("max_tokens") or 4096) or 4096)
        # Explicit "unset" checks: a configured temperature of 0 is a valid
        # (deterministic) setting and must not be replaced by the default.
        temperature = float(self._first_set(preset.get("reference_temperature"), 0.6))
        aggregator_temperature = float(
            self._first_set(
                preset.get("aggregator_temperature"),
                api_kwargs.get("temperature"),
                0.4,
            )
        )

        # When the preset is disabled, skip the reference fan-out and let the
        # configured aggregator act alone — it is the preset's acting model, so
        # a disabled MoA preset is simply "use the aggregator directly."
        if not preset.get("enabled", True):
            reference_models = []

        reference_outputs = _run_references_parallel(
            reference_models,
            _reference_messages(messages),
            temperature=temperature,
            max_tokens=max_tokens,
        )

        if reference_outputs:
            joined = "\n\n".join(
                f"Reference {idx} — {label}:\n{text}"
                for idx, (label, text) in enumerate(reference_outputs, start=1)
            )
            guidance = (
                "[Mixture of Agents reference context]\n"
                f"Preset: {self.preset_name}\n"
                f"Aggregator/acting model: {_slot_label(aggregator)}\n"
                f"References: {', '.join(label for label, _ in reference_outputs)}\n\n"
                "Use the reference responses below as private context. You are the aggregator and acting model: "
                "answer the user directly or call tools as needed.\n\n"
                f"{joined}"
            )
            agg_messages = self._attach_guidance(messages, guidance)
        else:
            agg_messages = [dict(m) for m in messages]

        return call_llm(
            task="moa_aggregator",
            provider=aggregator.get("provider"),
            model=aggregator.get("model"),
            messages=agg_messages,
            temperature=aggregator_temperature,
            max_tokens=api_kwargs.get("max_tokens"),
            tools=api_kwargs.get("tools"),
            extra_body=api_kwargs.get("extra_body"),
        )
|
|
|
|
|
|
class MoAClient:
    """Minimal client object exposing ``chat.completions`` for a MoA preset.

    The ``chat`` attribute is a bare namespace object whose ``completions``
    attribute is a ``MoAChatCompletions`` bound to ``preset_name``, so callers
    can use ``client.chat.completions.create(...)``.
    """

    def __init__(self, preset_name: str):
        chat_namespace_cls = type("_MoAChat", (), {})
        chat_namespace = chat_namespace_cls()
        chat_namespace.completions = MoAChatCompletions(preset_name)
        self.chat = chat_namespace
|