fix(nemo-relay): align adaptive config with tool_parallelism mode

Signed-off-by: mnajafian-nv <mnajafian@nvidia.com>
This commit is contained in:
mnajafian-nv 2026-06-08 11:48:19 -07:00
parent a38003be3d
commit 021d1034d0
No known key found for this signature in database
GPG key ID: C0C3EEEE9FB11E38
3 changed files with 110 additions and 25 deletions

View file

@ -173,8 +173,8 @@ include an adaptive component in the same `plugins.toml`:
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "observe_only"
```
When the adaptive component is enabled and the installed NeMo Relay runtime
@ -182,15 +182,16 @@ exposes `llm.execute(...)` / `tools.execute(...)`, Hermes routes LLM and tool
execution through those middleware boundaries. The observer hooks still emit
session, turn, approval, and subagent marks; the plugin skips its manual
`llm.call` and `tools.call` spans for executions that are already managed by
NeMo Relay.
NeMo Relay. `tool_parallelism.mode = "observe_only"` keeps tool scheduling
observational while still wrapping the real execution boundary.
For the full generic Hermes middleware contract, see
[`docs/middleware/README.md`](../../../docs/middleware/README.md).
## Canonical Local Examples
The examples below use the official `nemo-relay==0.3` distribution and a local
Ollama model served through the OpenAI-compatible API.
The observability-only examples in this section use the official `nemo-relay==0.3`
distribution and a local Ollama model served through the OpenAI-compatible API.
```bash
pip install "nemo-relay==0.3"
@ -404,8 +405,8 @@ version = 1
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "observe_only"
```
Enable it for Hermes:
@ -438,11 +439,12 @@ for the same execution.
### Local Adaptive E2E
This example enables both NeMo Relay observability export and adaptive execution
middleware for a local Hermes run.
middleware for a local Hermes run. This path requires a NeMo Relay runtime that
supports `[components.config.tool_parallelism]`; the `nemo-relay==0.3`
install used by the earlier observability-only examples does not support this
adaptive config.
```bash
pip install "nemo-relay==0.3"
export HERMES_HOME=/tmp/hermes-middleware-test/hermes-home
mkdir -p "$HERMES_HOME" /tmp/hermes-middleware-test/nemo-relay
@ -484,8 +486,8 @@ agent_version = "local"
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "observe_only"
TOML
export HERMES_NEMO_RELAY_PLUGINS_TOML=/tmp/hermes-middleware-test/nemo-relay/plugins.toml
@ -510,8 +512,8 @@ middleware_execution_ok
Expected ATOF shape:
```jsonl
{"kind":"scope","category":"llm","name":"custom","scope_category":"start","metadata":{"session_id":"middleware-demo-session"},"data":{"mode":"route"}}
{"kind":"scope","category":"tool","name":"terminal","scope_category":"start","metadata":{"session_id":"middleware-demo-session","tool_call_id":"call_terminal"},"data":{"mode":"route"}}
{"kind":"scope","category":"llm","name":"custom","scope_category":"start","metadata":{"session_id":"middleware-demo-session"},"data":{"mode":"observe_only"}}
{"kind":"scope","category":"tool","name":"terminal","scope_category":"start","metadata":{"session_id":"middleware-demo-session","tool_call_id":"call_terminal"},"data":{"mode":"observe_only"}}
{"kind":"scope","category":"tool","name":"terminal","scope_category":"end","metadata":{"session_id":"middleware-demo-session","tool_call_id":"call_terminal","status":"ok"},"data":"{\"output\":\"middleware_execution_ok\",\"exit_code\":0,\"error\":null}"}
```

View file

@ -44,7 +44,7 @@ class _Settings:
plugins_toml_path: str = ""
plugins_config: dict[str, Any] | None = None
adaptive_enabled: bool = False
adaptive_mode: str = "observe"
adaptive_mode: str = "observe_only"
atof_enabled: bool = False
atof_output_directory: str = ""
atof_filename: str = "hermes-atof.jsonl"
@ -611,11 +611,16 @@ def _enabled_component_config(
def _adaptive_mode(config: dict[str, Any] | None) -> str:
if not isinstance(config, dict):
return "observe"
return "observe_only"
tool_parallelism = config.get("tool_parallelism")
if isinstance(tool_parallelism, dict):
mode = tool_parallelism.get("mode")
if isinstance(mode, str) and mode.strip():
return mode.strip()
mode = config.get("mode")
if isinstance(mode, str) and mode.strip():
return mode.strip()
return "observe"
return "observe_only"
def _env(name: str) -> str:

View file

@ -457,8 +457,8 @@ version = 1
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "observe_only"
""",
encoding="utf-8",
)
@ -506,7 +506,7 @@ mode = "route"
assert response.choices == [raw_choice]
assert seen_request["intercepted"] is True
execute_start = next(event for event in fake.events if event[0] == "llm.execute.start")
assert execute_start[3]["data"]["mode"] == "route"
assert execute_start[3]["data"]["mode"] == "observe_only"
execute_end = next(event for event in fake.events if event[0] == "llm.execute.end")
assert execute_end[2] == {
"model": "demo-model",
@ -527,6 +527,84 @@ mode = "route"
}
def _adaptive_llm_execute_mode(tmp_path, monkeypatch, plugins_toml_text: str) -> str:
    """Run the LLM execution middleware once and return the recorded mode.

    Writes ``plugins_toml_text`` to ``tmp_path / "plugins.toml"``, points
    ``HERMES_NEMO_RELAY_PLUGINS_TOML`` at that file, invokes
    ``on_llm_execution_middleware`` on a fresh plugin, and returns the
    ``data["mode"]`` value of the first ``llm.execute.start`` event captured
    by the fake relay.
    """
    relay = _FakeNemoRelay()
    plugin = _fresh_plugin(monkeypatch, relay)

    toml_path = tmp_path / "plugins.toml"
    toml_path.write_text(plugins_toml_text, encoding="utf-8")
    monkeypatch.setenv("HERMES_NEMO_RELAY_PLUGINS_TOML", str(toml_path))

    plugin.on_llm_execution_middleware(
        session_id="s1",
        provider="anthropic",
        model="demo-model",
        request={"messages": [{"role": "user", "content": "hi"}]},
        next_call=lambda request: {"raw": request},
    )

    start_events = (event for event in relay.events if event[0] == "llm.execute.start")
    first_start = next(start_events)
    return first_start[3]["data"]["mode"]
def test_nemo_relay_adaptive_llm_execution_middleware_defaults_to_observe_only_when_mode_is_unset(
    tmp_path, monkeypatch
):
    """An adaptive config that sets no mode key reports ``observe_only``."""
    plugins_toml_text = """
version = 1
[[components]]
kind = "adaptive"
enabled = true
[components.config]
version = 1
"""
    assert _adaptive_llm_execute_mode(tmp_path, monkeypatch, plugins_toml_text) == "observe_only"
def test_nemo_relay_adaptive_llm_execution_middleware_accepts_legacy_top_level_mode(tmp_path, monkeypatch):
    """A top-level ``mode`` under ``[components.config]`` is still honoured."""
    plugins_toml_text = """
version = 1
[[components]]
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
"""
    assert _adaptive_llm_execute_mode(tmp_path, monkeypatch, plugins_toml_text) == "route"
def test_nemo_relay_adaptive_llm_execution_middleware_prefers_tool_parallelism_mode(tmp_path, monkeypatch):
    """``tool_parallelism.mode`` wins when a top-level ``mode`` is also set."""
    plugins_toml_text = """
version = 1
[[components]]
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "schedule"
"""
    assert _adaptive_llm_execute_mode(tmp_path, monkeypatch, plugins_toml_text) == "schedule"
def test_nemo_relay_llm_execution_middleware_calls_through_without_adaptive(monkeypatch):
fake = _FakeNemoRelay()
plugin = _fresh_plugin(monkeypatch, fake)
@ -555,8 +633,8 @@ version = 1
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "observe_only"
""",
encoding="utf-8",
)
@ -582,7 +660,7 @@ mode = "route"
assert response == {"raw": True, "args": {"command": "pwd", "intercepted": True}}
assert seen_args["intercepted"] is True
execute_start = next(event for event in fake.events if event[0] == "tool.execute.start")
assert execute_start[3]["data"]["mode"] == "route"
assert execute_start[3]["data"]["mode"] == "observe_only"
assert execute_start[3]["data"]["tool_call_id"] == "tool-1"
@ -613,8 +691,8 @@ version = 1
kind = "adaptive"
enabled = true
[components.config]
mode = "route"
[components.config.tool_parallelism]
mode = "observe_only"
""",
encoding="utf-8",
)