diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 1e455555e9a..c1f7c04d7f2 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -2531,6 +2531,65 @@ def systemd_unit_is_current(system: bool = False) -> bool: return norm_installed == norm_expected +def _temp_home_in_service_definition(definition: str) -> str | None: + """Return the temp-dir HERMES_HOME baked into a service definition, or None. + + A generated systemd unit / launchd plist carries the resolved HERMES_HOME + in its environment block. If that path lives under the system temp dir, + the definition was almost certainly generated by a test/E2E harness that + exported a throwaway ``HERMES_HOME=/tmp/...`` — writing it to the real + service file silently breaks the user's gateway on the next (re)start: + the gateway comes back "active (running)" but pointed at an empty temp + home ("No messaging platforms enabled"), deaf to every platform. + Seen live 2026-06-11: an E2E guard probe ran ``hermes gateway restart`` + with ``HERMES_HOME=/tmp/hermes-e2e-`` exported; the restart path's + unit refresh baked the temp path into the production unit and the + post-update restart produced a zombie gateway for 7+ hours. + + Matches both systemd ``Environment="HERMES_HOME=..."`` lines and launchd + ``HERMES_HOME...`` pairs. + """ + import re + import tempfile + + candidates = re.findall(r'HERMES_HOME=([^"\n]+)', definition) + candidates += re.findall( + r"HERMES_HOME\s*(.*?)", definition, flags=re.S + ) + temp_roots = { + Path(tempfile.gettempdir()).resolve(), + Path("/tmp"), + Path("/var/tmp"), + Path("/private/tmp"), + Path("/private/var/tmp"), + } + for raw in candidates: + try: + resolved = Path(raw.strip().strip('"')).resolve() + except (OSError, ValueError): + continue + for root in temp_roots: + if resolved == root or root in resolved.parents: + return raw.strip() + return None + + +def _refuse_temp_home_service_write(definition: str, kind: str) -> bool: + """Refuse (with guidance) when a service definition carries a temp HERMES_HOME.""" + temp_home = _temp_home_in_service_definition(definition) + if temp_home is None: + return False + print( + f"✗ Refusing to write the gateway {kind}: HERMES_HOME resolves to a " + f"temporary directory ({temp_home})." + ) + print( + " This usually means a test/E2E environment exported HERMES_HOME. " + "Unset it (or run from a clean shell) and retry." + ) + return True + + def refresh_systemd_unit_if_needed(system: bool = False) -> bool: """Rewrite the installed systemd unit when the generated definition has changed.""" unit_path = get_systemd_unit_path(system=system) @@ -2561,6 +2620,12 @@ def refresh_systemd_unit_if_needed(system: bool = False) -> bool: ): return False + # Structural variant of the same belt: refuse to bake ANY temp-dir + # HERMES_HOME into the unit (manual E2E homes like /tmp/hermes-e2e-NNN + # don't carry the pytest markers above but poison the unit identically). + if _refuse_temp_home_service_write(new_unit, "systemd unit"): + return False + unit_path.write_text(new_unit, encoding="utf-8") _run_systemctl(["daemon-reload"], system=system, check=True, timeout=30) print( @@ -2729,10 +2794,11 @@ def systemd_install( return unit_path.parent.mkdir(parents=True, exist_ok=True) + new_unit = generate_systemd_unit(system=system, run_as_user=run_as_user) + if _refuse_temp_home_service_write(new_unit, "systemd unit"): + return print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}") - unit_path.write_text( - generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8" - ) + unit_path.write_text(new_unit, encoding="utf-8") _run_systemctl(["daemon-reload"], system=system, check=True, timeout=30) if enable_on_startup: @@ -3362,7 +3428,11 @@ def refresh_launchd_plist_if_needed() -> bool: if not plist_path.exists() or launchd_plist_is_current(): return False - plist_path.write_text(generate_launchd_plist(), encoding="utf-8") + new_plist = generate_launchd_plist() + if _refuse_temp_home_service_write(new_plist, "launchd plist"): + return False + + plist_path.write_text(new_plist, encoding="utf-8") label = get_launchd_label() # Bootout/bootstrap so launchd picks up the new definition subprocess.run( @@ -3395,8 +3465,11 @@ def launchd_install(force: bool = False): return plist_path.parent.mkdir(parents=True, exist_ok=True) + new_plist = generate_launchd_plist() + if _refuse_temp_home_service_write(new_plist, "launchd plist"): + return print(f"Installing launchd service to: {plist_path}") - plist_path.write_text(generate_launchd_plist()) + plist_path.write_text(new_plist) try: subprocess.run( @@ -3442,9 +3515,12 @@ def launchd_start(): # Self-heal if the plist is missing entirely (e.g., manual cleanup, failed upgrade) if not plist_path.exists(): + new_plist = generate_launchd_plist() + if _refuse_temp_home_service_write(new_plist, "launchd plist"): + sys.exit(1) print("↻ launchd plist missing; regenerating service definition") plist_path.parent.mkdir(parents=True, exist_ok=True) - plist_path.write_text(generate_launchd_plist(), encoding="utf-8") + plist_path.write_text(new_plist, encoding="utf-8") try: subprocess.run( ["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py index 0988f8fb64a..30773e1ed13 100644 --- a/tests/hermes_cli/test_gateway.py +++ b/tests/hermes_cli/test_gateway.py @@ -369,6 +369,16 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys): unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service" monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path) + # Synthetic unit with a non-temp home: the real generator bakes the + # hermetic test HERMES_HOME (a tmp dir), which the temp-home write + # guard correctly refuses. + monkeypatch.setattr( + gateway, + "generate_systemd_unit", + lambda system=False, run_as_user=None: ( + '[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n' + ), + ) calls = [] helper_calls = [] @@ -396,6 +406,15 @@ def test_systemd_install_can_skip_enable_on_startup(monkeypatch, tmp_path, capsy unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service" monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path) + # Non-temp home so the temp-home write guard (which trips on the + # hermetic test HERMES_HOME) stays out of the way. + monkeypatch.setattr( + gateway, + "generate_systemd_unit", + lambda system=False, run_as_user=None: ( + '[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n' + ), + ) calls = [] helper_calls = [] diff --git a/tests/hermes_cli/test_gateway_linger.py b/tests/hermes_cli/test_gateway_linger.py index 90f8ea3d708..4a34f7ab1b6 100644 --- a/tests/hermes_cli/test_gateway_linger.py +++ b/tests/hermes_cli/test_gateway_linger.py @@ -102,6 +102,15 @@ def test_systemd_install_calls_linger_helper(monkeypatch, tmp_path, capsys): unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service" monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path) + # Non-temp home so the temp-home write guard (which trips on the + # hermetic test HERMES_HOME) stays out of the way. + monkeypatch.setattr( + gateway, + "generate_systemd_unit", + lambda system=False, run_as_user=None: ( + '[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n' + ), + ) calls = [] diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index 13f1636e4a6..0c6d7ca836d 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -289,6 +289,105 @@ class TestSystemdServiceRefresh: "daemon-reload" in str(c) for c in ran ), "daemon-reload must not run when write was refused" + def test_refresh_refuses_to_bake_any_tempdir_home_into_real_user_unit( + self, tmp_path, monkeypatch + ): + """Structural guard: a manual E2E HERMES_HOME like + ``/tmp/hermes-e2e-41264`` carries none of the pytest markers but + poisons the unit identically (seen live 2026-06-11 — an E2E probe ran + ``hermes gateway restart`` with a /tmp HERMES_HOME exported; the + restart's unit refresh baked it into the production unit and the + post-update restart produced a 7-hour zombie gateway). The refresh + must refuse ANY temp-dir HERMES_HOME, not just pytest-shaped ones. + """ + unit_path = tmp_path / "hermes-gateway.service" + unit_path.write_text("old unit\n", encoding="utf-8") + + monkeypatch.setattr( + gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path + ) + polluted_unit = ( + "[Service]\n" + 'Environment="HERMES_HOME=/tmp/hermes-e2e-41264"\n' + "WorkingDirectory=/tmp/hermes-e2e-41264\n" + ) + monkeypatch.setattr( + gateway_cli, + "generate_systemd_unit", + lambda system=False, run_as_user=None: polluted_unit, + ) + + ran = [] + + def fake_run(cmd, check=True, **kwargs): + ran.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + result = gateway_cli.refresh_systemd_unit_if_needed(system=False) + + assert result is False, "refresh should refuse to write a temp-home unit" + assert ( + unit_path.read_text(encoding="utf-8") == "old unit\n" + ), "installed unit must be left untouched" + assert not any( + "daemon-reload" in str(c) for c in ran + ), "daemon-reload must not run when write was refused" + + +class TestTempHomeServiceDefinitionGuard: + """_temp_home_in_service_definition() — structural temp-dir detection.""" + + def test_detects_tmp_home_in_systemd_unit(self): + unit = '[Service]\nEnvironment="HERMES_HOME=/tmp/hermes-e2e-41264"\n' + assert ( + gateway_cli._temp_home_in_service_definition(unit) + == "/tmp/hermes-e2e-41264" + ) + + def test_detects_var_tmp_home(self): + unit = '[Service]\nEnvironment="HERMES_HOME=/var/tmp/hermes-x"\n' + assert gateway_cli._temp_home_in_service_definition(unit) is not None + + def test_detects_tempdir_env_home(self, monkeypatch, tmp_path): + import tempfile as _tempfile + + monkeypatch.setattr(_tempfile, "gettempdir", lambda: str(tmp_path)) + unit = f'[Service]\nEnvironment="HERMES_HOME={tmp_path}/hermes-home"\n' + assert gateway_cli._temp_home_in_service_definition(unit) is not None + + def test_detects_tmp_home_in_launchd_plist(self): + plist = ( + "\n HERMES_HOME\n" + " /tmp/hermes-e2e-99999\n\n" + ) + assert ( + gateway_cli._temp_home_in_service_definition(plist) + == "/tmp/hermes-e2e-99999" + ) + + def test_accepts_real_home(self): + unit = '[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n' + assert gateway_cli._temp_home_in_service_definition(unit) is None + + def test_accepts_macos_real_home_plist(self): + plist = ( + "\n HERMES_HOME\n" + " /Users/alice/.hermes\n\n" + ) + assert gateway_cli._temp_home_in_service_definition(plist) is None + + def test_accepts_unit_without_hermes_home(self): + unit = "[Service]\nExecStart=/usr/bin/python -m hermes_cli.main gateway run\n" + assert gateway_cli._temp_home_in_service_definition(unit) is None + + def test_tmp_prefixed_non_temp_path_is_accepted(self): + # /tmpfs-data is NOT under /tmp — prefix matching must be + # component-wise, not string startswith. + unit = '[Service]\nEnvironment="HERMES_HOME=/tmpfs-data/.hermes"\n' + assert gateway_cli._temp_home_in_service_definition(unit) is None + class TestRequireServiceInstalled: def test_exits_with_install_hint_when_unit_missing(self, tmp_path, monkeypatch, capsys): @@ -481,6 +580,17 @@ class TestLaunchdServiceRecovery: plist_path.write_text("old content", encoding="utf-8") monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + # Patch the generator with synthetic content carrying a real-looking + # home — the temp-home guard refuses to write plists whose + # HERMES_HOME resolves under the (pytest tmp) test HERMES_HOME. + monkeypatch.setattr( + gateway_cli, + "generate_launchd_plist", + lambda: ( + "--replace\nHERMES_HOME" + "/Users/alice/.hermes" + ), + ) calls = [] @@ -776,6 +886,17 @@ class TestLaunchdServiceRecovery: """macOS bootstrap error 5 should spawn a detached gateway, not crash.""" plist_path = tmp_path / "ai.hermes.gateway.plist" monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + # Synthetic plist with a non-temp home so the temp-home write guard + # (which would trip on the pytest-tmp test HERMES_HOME) stays out of + # the way — this test exercises the bootstrap-error fallback. + monkeypatch.setattr( + gateway_cli, + "generate_launchd_plist", + lambda: ( + "HERMES_HOME" + "/Users/alice/.hermes" + ), + ) def fake_run(cmd, check=False, **kwargs): if cmd[:2] == ["launchctl", "bootstrap"]: