fix(observability): recover after plugin-config clear failure

Ensure failed plugin-config clear operations still re-arm managed reinitialization on the next Hermes session.

Add focused regression coverage for successful init, failed final-session clear, and next-session recovery.

Signed-off-by: mnajafian-nv <mnajafian@nvidia.com>
This commit is contained in:
mnajafian-nv 2026-06-08 07:50:10 -07:00
parent ecd4679d8c
commit 728612c29c
No known key found for this signature in database
GPG key ID: C0C3EEEE9FB11E38
2 changed files with 46 additions and 3 deletions

View file

@ -93,9 +93,11 @@ class _Runtime:
clear = getattr(plugin_mod, "clear", None)
if not callable(clear):
return
_resolve_awaitable(clear())
self._plugin_config_initialized = False
self._plugin_config_needs_reinit = bool(self.settings.plugins_config)
try:
_resolve_awaitable(clear())
finally:
self._plugin_config_initialized = False
self._plugin_config_needs_reinit = bool(self.settings.plugins_config)
def _activate_direct_fallbacks(self) -> None:
self._plugin_config_needs_reinit = False

View file

@ -541,6 +541,47 @@ enabled = true
assert "hermes-session-s2" in scope_push_names
def test_nemo_relay_plugin_retries_plugins_toml_after_clear_failure(tmp_path, monkeypatch):
    """Check that plugin initialization is attempted again after ``clear`` raises.

    The fake relay's ``plugin.initialize`` is replaced with a coroutine that
    records every attempt, and ``plugin.clear`` with a coroutine that records
    the call and then raises ``RuntimeError``. A minimal ``plugins.toml`` is
    written under ``tmp_path`` and exposed through the
    ``HERMES_NEMO_RELAY_PLUGINS_TOML`` environment variable.

    Session ``s1`` is started and finalized, then session ``s2`` is started.
    The recorded events must show two initialize attempts, exactly one failed
    clear, and a ``scope.push`` event named ``hermes-session-s2``.
    """
    fake = _FakeNemoRelay()
    init_attempts = 0

    async def _record_initialize(config):
        # Log each attempt together with its running ordinal and the config it received.
        nonlocal init_attempts
        init_attempts += 1
        fake.events.append(("plugin.initialize.attempt", init_attempts, config))
        return {"diagnostics": []}

    async def _raise_on_clear():
        # Record the call before raising so the failure itself can be counted.
        fake.events.append(("plugin.clear.failed",))
        raise RuntimeError("boom")

    fake.plugin.initialize = _record_initialize
    fake.plugin.clear = _raise_on_clear
    plugin = _fresh_plugin(monkeypatch, fake)

    config_path = tmp_path / "plugins.toml"
    config_path.write_text(
        """
version = 1
[[components]]
kind = "observability"
enabled = true
""",
        encoding="utf-8",
    )
    monkeypatch.setenv("HERMES_NEMO_RELAY_PLUGINS_TOML", str(config_path))

    # First session, its finalization (where the failing clear is expected
    # to be hit — the count assertion below confirms it), then a second session.
    plugin.on_session_start(session_id="s1")
    plugin.on_session_finalize(session_id="s1", reason="shutdown")
    plugin.on_session_start(session_id="s2")

    recorded_kinds = [entry[0] for entry in fake.events]
    assert recorded_kinds.count("plugin.initialize.attempt") == 2
    assert recorded_kinds.count("plugin.clear.failed") == 1

    pushed_scopes = [entry[1] for entry in fake.events if entry[0] == "scope.push"]
    assert "hermes-session-s2" in pushed_scopes
def test_nemo_relay_plugin_disables_direct_atif_when_plugins_toml_owns_atif(tmp_path, monkeypatch):
fake = _FakeNemoRelay()
plugin = _fresh_plugin(monkeypatch, fake)