fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME (#44267)

* fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME

A test/E2E harness that exports HERMES_HOME=/tmp/... and touches any
gateway service write path (install, start self-heal, restart's
refresh_systemd_unit_if_needed) bakes the throwaway home into the
production systemd unit / launchd plist. The gateway then restarts
'healthy' but pointed at an empty temp home — no platforms enabled,
deaf to every message (live incident 2026-06-11: /tmp/hermes-e2e-41264
poisoned the unit during a PR-review E2E probe; the post-update restart
produced a 7-hour zombie gateway).

The existing safety belt only sniffed pytest-shaped markers
(/pytest-of-, /hermes_test). Add a structural guard:
_temp_home_in_service_definition() extracts HERMES_HOME from the
generated systemd unit or launchd plist and refuses the write (with
actionable guidance) when it resolves under tempfile.gettempdir(),
/tmp, /var/tmp, or the macOS /private variants. Wired into all five
write sites: systemd refresh + install, launchd refresh + install +
start self-heal.

* test: patch unit generator in install tests tripped by temp-home guard

CI runs hermetic with HERMES_HOME under a tmp dir, so the real
generate_systemd_unit() output now (correctly) trips the new temp-home
write guard in three install tests. Patch the generator with synthetic
non-temp content — same pattern the existing pytest-marker guard tests
use.
This commit is contained in:
Teknium 2026-06-11 06:10:08 -07:00 committed by GitHub
parent 8972a151a4
commit f456f302df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 231 additions and 6 deletions

View file

@ -2531,6 +2531,65 @@ def systemd_unit_is_current(system: bool = False) -> bool:
return norm_installed == norm_expected
def _temp_home_in_service_definition(definition: str) -> str | None:
"""Return the temp-dir HERMES_HOME baked into a service definition, or None.
A generated systemd unit / launchd plist carries the resolved HERMES_HOME
in its environment block. If that path lives under the system temp dir,
the definition was almost certainly generated by a test/E2E harness that
exported a throwaway ``HERMES_HOME=/tmp/...`` writing it to the real
service file silently breaks the user's gateway on the next (re)start:
the gateway comes back "active (running)" but pointed at an empty temp
home ("No messaging platforms enabled"), deaf to every platform.
Seen live 2026-06-11: an E2E guard probe ran ``hermes gateway restart``
with ``HERMES_HOME=/tmp/hermes-e2e-<pr>`` exported; the restart path's
unit refresh baked the temp path into the production unit and the
post-update restart produced a zombie gateway for 7+ hours.
Matches both systemd ``Environment="HERMES_HOME=..."`` lines and launchd
``<key>HERMES_HOME</key><string>...</string>`` pairs.
"""
import re
import tempfile
candidates = re.findall(r'HERMES_HOME=([^"\n]+)', definition)
candidates += re.findall(
r"<key>HERMES_HOME</key>\s*<string>(.*?)</string>", definition, flags=re.S
)
temp_roots = {
Path(tempfile.gettempdir()).resolve(),
Path("/tmp"),
Path("/var/tmp"),
Path("/private/tmp"),
Path("/private/var/tmp"),
}
for raw in candidates:
try:
resolved = Path(raw.strip().strip('"')).resolve()
except (OSError, ValueError):
continue
for root in temp_roots:
if resolved == root or root in resolved.parents:
return raw.strip()
return None
def _refuse_temp_home_service_write(definition: str, kind: str) -> bool:
"""Refuse (with guidance) when a service definition carries a temp HERMES_HOME."""
temp_home = _temp_home_in_service_definition(definition)
if temp_home is None:
return False
print(
f"✗ Refusing to write the gateway {kind}: HERMES_HOME resolves to a "
f"temporary directory ({temp_home})."
)
print(
" This usually means a test/E2E environment exported HERMES_HOME. "
"Unset it (or run from a clean shell) and retry."
)
return True
def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
"""Rewrite the installed systemd unit when the generated definition has changed."""
unit_path = get_systemd_unit_path(system=system)
@ -2561,6 +2620,12 @@ def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
):
return False
# Structural variant of the same belt: refuse to bake ANY temp-dir
# HERMES_HOME into the unit (manual E2E homes like /tmp/hermes-e2e-NNN
# don't carry the pytest markers above but poison the unit identically).
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
return False
unit_path.write_text(new_unit, encoding="utf-8")
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
print(
@ -2729,10 +2794,11 @@ def systemd_install(
return
unit_path.parent.mkdir(parents=True, exist_ok=True)
new_unit = generate_systemd_unit(system=system, run_as_user=run_as_user)
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
return
print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}")
unit_path.write_text(
generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8"
)
unit_path.write_text(new_unit, encoding="utf-8")
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
if enable_on_startup:
@ -3362,7 +3428,11 @@ def refresh_launchd_plist_if_needed() -> bool:
if not plist_path.exists() or launchd_plist_is_current():
return False
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
new_plist = generate_launchd_plist()
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
return False
plist_path.write_text(new_plist, encoding="utf-8")
label = get_launchd_label()
# Bootout/bootstrap so launchd picks up the new definition
subprocess.run(
@ -3395,8 +3465,11 @@ def launchd_install(force: bool = False):
return
plist_path.parent.mkdir(parents=True, exist_ok=True)
new_plist = generate_launchd_plist()
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
return
print(f"Installing launchd service to: {plist_path}")
plist_path.write_text(generate_launchd_plist())
plist_path.write_text(new_plist)
try:
subprocess.run(
@ -3442,9 +3515,12 @@ def launchd_start():
# Self-heal if the plist is missing entirely (e.g., manual cleanup, failed upgrade)
if not plist_path.exists():
new_plist = generate_launchd_plist()
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
sys.exit(1)
print("↻ launchd plist missing; regenerating service definition")
plist_path.parent.mkdir(parents=True, exist_ok=True)
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
plist_path.write_text(new_plist, encoding="utf-8")
try:
subprocess.run(
["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],

View file

@ -369,6 +369,16 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys):
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
# Synthetic unit with a non-temp home: the real generator bakes the
# hermetic test HERMES_HOME (a tmp dir), which the temp-home write
# guard correctly refuses.
monkeypatch.setattr(
gateway,
"generate_systemd_unit",
lambda system=False, run_as_user=None: (
'[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
),
)
calls = []
helper_calls = []
@ -396,6 +406,15 @@ def test_systemd_install_can_skip_enable_on_startup(monkeypatch, tmp_path, capsy
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
# Non-temp home so the temp-home write guard (which trips on the
# hermetic test HERMES_HOME) stays out of the way.
monkeypatch.setattr(
gateway,
"generate_systemd_unit",
lambda system=False, run_as_user=None: (
'[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
),
)
calls = []
helper_calls = []

View file

@ -102,6 +102,15 @@ def test_systemd_install_calls_linger_helper(monkeypatch, tmp_path, capsys):
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
# Non-temp home so the temp-home write guard (which trips on the
# hermetic test HERMES_HOME) stays out of the way.
monkeypatch.setattr(
gateway,
"generate_systemd_unit",
lambda system=False, run_as_user=None: (
'[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
),
)
calls = []

View file

@ -289,6 +289,105 @@ class TestSystemdServiceRefresh:
"daemon-reload" in str(c) for c in ran
), "daemon-reload must not run when write was refused"
def test_refresh_refuses_to_bake_any_tempdir_home_into_real_user_unit(
self, tmp_path, monkeypatch
):
"""Structural guard: a manual E2E HERMES_HOME like
``/tmp/hermes-e2e-41264`` carries none of the pytest markers but
poisons the unit identically (seen live 2026-06-11 an E2E probe ran
``hermes gateway restart`` with a /tmp HERMES_HOME exported; the
restart's unit refresh baked it into the production unit and the
post-update restart produced a 7-hour zombie gateway). The refresh
must refuse ANY temp-dir HERMES_HOME, not just pytest-shaped ones.
"""
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("old unit\n", encoding="utf-8")
monkeypatch.setattr(
gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path
)
polluted_unit = (
"[Service]\n"
'Environment="HERMES_HOME=/tmp/hermes-e2e-41264"\n'
"WorkingDirectory=/tmp/hermes-e2e-41264\n"
)
monkeypatch.setattr(
gateway_cli,
"generate_systemd_unit",
lambda system=False, run_as_user=None: polluted_unit,
)
ran = []
def fake_run(cmd, check=True, **kwargs):
ran.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
result = gateway_cli.refresh_systemd_unit_if_needed(system=False)
assert result is False, "refresh should refuse to write a temp-home unit"
assert (
unit_path.read_text(encoding="utf-8") == "old unit\n"
), "installed unit must be left untouched"
assert not any(
"daemon-reload" in str(c) for c in ran
), "daemon-reload must not run when write was refused"
class TestTempHomeServiceDefinitionGuard:
"""_temp_home_in_service_definition() — structural temp-dir detection."""
def test_detects_tmp_home_in_systemd_unit(self):
unit = '[Service]\nEnvironment="HERMES_HOME=/tmp/hermes-e2e-41264"\n'
assert (
gateway_cli._temp_home_in_service_definition(unit)
== "/tmp/hermes-e2e-41264"
)
def test_detects_var_tmp_home(self):
unit = '[Service]\nEnvironment="HERMES_HOME=/var/tmp/hermes-x"\n'
assert gateway_cli._temp_home_in_service_definition(unit) is not None
def test_detects_tempdir_env_home(self, monkeypatch, tmp_path):
import tempfile as _tempfile
monkeypatch.setattr(_tempfile, "gettempdir", lambda: str(tmp_path))
unit = f'[Service]\nEnvironment="HERMES_HOME={tmp_path}/hermes-home"\n'
assert gateway_cli._temp_home_in_service_definition(unit) is not None
def test_detects_tmp_home_in_launchd_plist(self):
plist = (
"<dict>\n <key>HERMES_HOME</key>\n"
" <string>/tmp/hermes-e2e-99999</string>\n</dict>\n"
)
assert (
gateway_cli._temp_home_in_service_definition(plist)
== "/tmp/hermes-e2e-99999"
)
def test_accepts_real_home(self):
unit = '[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
assert gateway_cli._temp_home_in_service_definition(unit) is None
def test_accepts_macos_real_home_plist(self):
plist = (
"<dict>\n <key>HERMES_HOME</key>\n"
" <string>/Users/alice/.hermes</string>\n</dict>\n"
)
assert gateway_cli._temp_home_in_service_definition(plist) is None
def test_accepts_unit_without_hermes_home(self):
unit = "[Service]\nExecStart=/usr/bin/python -m hermes_cli.main gateway run\n"
assert gateway_cli._temp_home_in_service_definition(unit) is None
def test_tmp_prefixed_non_temp_path_is_accepted(self):
# /tmpfs-data is NOT under /tmp — prefix matching must be
# component-wise, not string startswith.
unit = '[Service]\nEnvironment="HERMES_HOME=/tmpfs-data/.hermes"\n'
assert gateway_cli._temp_home_in_service_definition(unit) is None
class TestRequireServiceInstalled:
def test_exits_with_install_hint_when_unit_missing(self, tmp_path, monkeypatch, capsys):
@ -481,6 +580,17 @@ class TestLaunchdServiceRecovery:
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
# Patch the generator with synthetic content carrying a real-looking
# home — the temp-home guard refuses to write plists whose
# HERMES_HOME resolves under the (pytest tmp) test HERMES_HOME.
monkeypatch.setattr(
gateway_cli,
"generate_launchd_plist",
lambda: (
"<plist>--replace\n<key>HERMES_HOME</key>"
"<string>/Users/alice/.hermes</string></plist>"
),
)
calls = []
@ -776,6 +886,17 @@ class TestLaunchdServiceRecovery:
"""macOS bootstrap error 5 should spawn a detached gateway, not crash."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
# Synthetic plist with a non-temp home so the temp-home write guard
# (which would trip on the pytest-tmp test HERMES_HOME) stays out of
# the way — this test exercises the bootstrap-error fallback.
monkeypatch.setattr(
gateway_cli,
"generate_launchd_plist",
lambda: (
"<plist><key>HERMES_HOME</key>"
"<string>/Users/alice/.hermes</string></plist>"
),
)
def fake_run(cmd, check=False, **kwargs):
if cmd[:2] == ["launchctl", "bootstrap"]: