mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME (#44267)
* fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME A test/E2E harness that exports HERMES_HOME=/tmp/... and touches any gateway service write path (install, start self-heal, restart's refresh_systemd_unit_if_needed) bakes the throwaway home into the production systemd unit / launchd plist. The gateway then restarts 'healthy' but pointed at an empty temp home — no platforms enabled, deaf to every message (live incident 2026-06-11: /tmp/hermes-e2e-41264 poisoned the unit during a PR-review E2E probe; the post-update restart produced a 7-hour zombie gateway). The existing safety belt only sniffed pytest-shaped markers (/pytest-of-, /hermes_test). Add a structural guard: _temp_home_in_service_definition() extracts HERMES_HOME from the generated systemd unit or launchd plist and refuses the write (with actionable guidance) when it resolves under tempfile.gettempdir(), /tmp, /var/tmp, or the macOS /private variants. Wired into all five write sites: systemd refresh + install, launchd refresh + install + start self-heal. * test: patch unit generator in install tests tripped by temp-home guard CI runs hermetic with HERMES_HOME under a tmp dir, so the real generate_systemd_unit() output now (correctly) trips the new temp-home write guard in three install tests. Patch the generator with synthetic non-temp content — same pattern the existing pytest-marker guard tests use.
This commit is contained in:
parent
8972a151a4
commit
f456f302df
4 changed files with 231 additions and 6 deletions
|
|
@ -2531,6 +2531,65 @@ def systemd_unit_is_current(system: bool = False) -> bool:
|
|||
return norm_installed == norm_expected
|
||||
|
||||
|
||||
def _temp_home_in_service_definition(definition: str) -> str | None:
|
||||
"""Return the temp-dir HERMES_HOME baked into a service definition, or None.
|
||||
|
||||
A generated systemd unit / launchd plist carries the resolved HERMES_HOME
|
||||
in its environment block. If that path lives under the system temp dir,
|
||||
the definition was almost certainly generated by a test/E2E harness that
|
||||
exported a throwaway ``HERMES_HOME=/tmp/...`` — writing it to the real
|
||||
service file silently breaks the user's gateway on the next (re)start:
|
||||
the gateway comes back "active (running)" but pointed at an empty temp
|
||||
home ("No messaging platforms enabled"), deaf to every platform.
|
||||
Seen live 2026-06-11: an E2E guard probe ran ``hermes gateway restart``
|
||||
with ``HERMES_HOME=/tmp/hermes-e2e-<pr>`` exported; the restart path's
|
||||
unit refresh baked the temp path into the production unit and the
|
||||
post-update restart produced a zombie gateway for 7+ hours.
|
||||
|
||||
Matches both systemd ``Environment="HERMES_HOME=..."`` lines and launchd
|
||||
``<key>HERMES_HOME</key><string>...</string>`` pairs.
|
||||
"""
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
candidates = re.findall(r'HERMES_HOME=([^"\n]+)', definition)
|
||||
candidates += re.findall(
|
||||
r"<key>HERMES_HOME</key>\s*<string>(.*?)</string>", definition, flags=re.S
|
||||
)
|
||||
temp_roots = {
|
||||
Path(tempfile.gettempdir()).resolve(),
|
||||
Path("/tmp"),
|
||||
Path("/var/tmp"),
|
||||
Path("/private/tmp"),
|
||||
Path("/private/var/tmp"),
|
||||
}
|
||||
for raw in candidates:
|
||||
try:
|
||||
resolved = Path(raw.strip().strip('"')).resolve()
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
for root in temp_roots:
|
||||
if resolved == root or root in resolved.parents:
|
||||
return raw.strip()
|
||||
return None
|
||||
|
||||
|
||||
def _refuse_temp_home_service_write(definition: str, kind: str) -> bool:
|
||||
"""Refuse (with guidance) when a service definition carries a temp HERMES_HOME."""
|
||||
temp_home = _temp_home_in_service_definition(definition)
|
||||
if temp_home is None:
|
||||
return False
|
||||
print(
|
||||
f"✗ Refusing to write the gateway {kind}: HERMES_HOME resolves to a "
|
||||
f"temporary directory ({temp_home})."
|
||||
)
|
||||
print(
|
||||
" This usually means a test/E2E environment exported HERMES_HOME. "
|
||||
"Unset it (or run from a clean shell) and retry."
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
|
||||
"""Rewrite the installed systemd unit when the generated definition has changed."""
|
||||
unit_path = get_systemd_unit_path(system=system)
|
||||
|
|
@ -2561,6 +2620,12 @@ def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
|
|||
):
|
||||
return False
|
||||
|
||||
# Structural variant of the same belt: refuse to bake ANY temp-dir
|
||||
# HERMES_HOME into the unit (manual E2E homes like /tmp/hermes-e2e-NNN
|
||||
# don't carry the pytest markers above but poison the unit identically).
|
||||
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
|
||||
return False
|
||||
|
||||
unit_path.write_text(new_unit, encoding="utf-8")
|
||||
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
|
||||
print(
|
||||
|
|
@ -2729,10 +2794,11 @@ def systemd_install(
|
|||
return
|
||||
|
||||
unit_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
new_unit = generate_systemd_unit(system=system, run_as_user=run_as_user)
|
||||
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
|
||||
return
|
||||
print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}")
|
||||
unit_path.write_text(
|
||||
generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8"
|
||||
)
|
||||
unit_path.write_text(new_unit, encoding="utf-8")
|
||||
|
||||
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
|
||||
if enable_on_startup:
|
||||
|
|
@ -3362,7 +3428,11 @@ def refresh_launchd_plist_if_needed() -> bool:
|
|||
if not plist_path.exists() or launchd_plist_is_current():
|
||||
return False
|
||||
|
||||
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
|
||||
new_plist = generate_launchd_plist()
|
||||
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
|
||||
return False
|
||||
|
||||
plist_path.write_text(new_plist, encoding="utf-8")
|
||||
label = get_launchd_label()
|
||||
# Bootout/bootstrap so launchd picks up the new definition
|
||||
subprocess.run(
|
||||
|
|
@ -3395,8 +3465,11 @@ def launchd_install(force: bool = False):
|
|||
return
|
||||
|
||||
plist_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
new_plist = generate_launchd_plist()
|
||||
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
|
||||
return
|
||||
print(f"Installing launchd service to: {plist_path}")
|
||||
plist_path.write_text(generate_launchd_plist())
|
||||
plist_path.write_text(new_plist)
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
|
|
@ -3442,9 +3515,12 @@ def launchd_start():
|
|||
|
||||
# Self-heal if the plist is missing entirely (e.g., manual cleanup, failed upgrade)
|
||||
if not plist_path.exists():
|
||||
new_plist = generate_launchd_plist()
|
||||
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
|
||||
sys.exit(1)
|
||||
print("↻ launchd plist missing; regenerating service definition")
|
||||
plist_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
|
||||
plist_path.write_text(new_plist, encoding="utf-8")
|
||||
try:
|
||||
subprocess.run(
|
||||
["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],
|
||||
|
|
|
|||
|
|
@ -369,6 +369,16 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys):
|
|||
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
|
||||
|
||||
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
|
||||
# Synthetic unit with a non-temp home: the real generator bakes the
|
||||
# hermetic test HERMES_HOME (a tmp dir), which the temp-home write
|
||||
# guard correctly refuses.
|
||||
monkeypatch.setattr(
|
||||
gateway,
|
||||
"generate_systemd_unit",
|
||||
lambda system=False, run_as_user=None: (
|
||||
'[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
|
||||
),
|
||||
)
|
||||
|
||||
calls = []
|
||||
helper_calls = []
|
||||
|
|
@ -396,6 +406,15 @@ def test_systemd_install_can_skip_enable_on_startup(monkeypatch, tmp_path, capsy
|
|||
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
|
||||
|
||||
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
|
||||
# Non-temp home so the temp-home write guard (which trips on the
|
||||
# hermetic test HERMES_HOME) stays out of the way.
|
||||
monkeypatch.setattr(
|
||||
gateway,
|
||||
"generate_systemd_unit",
|
||||
lambda system=False, run_as_user=None: (
|
||||
'[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
|
||||
),
|
||||
)
|
||||
|
||||
calls = []
|
||||
helper_calls = []
|
||||
|
|
|
|||
|
|
@ -102,6 +102,15 @@ def test_systemd_install_calls_linger_helper(monkeypatch, tmp_path, capsys):
|
|||
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
|
||||
|
||||
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
|
||||
# Non-temp home so the temp-home write guard (which trips on the
|
||||
# hermetic test HERMES_HOME) stays out of the way.
|
||||
monkeypatch.setattr(
|
||||
gateway,
|
||||
"generate_systemd_unit",
|
||||
lambda system=False, run_as_user=None: (
|
||||
'[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
|
||||
),
|
||||
)
|
||||
|
||||
calls = []
|
||||
|
||||
|
|
|
|||
|
|
@ -289,6 +289,105 @@ class TestSystemdServiceRefresh:
|
|||
"daemon-reload" in str(c) for c in ran
|
||||
), "daemon-reload must not run when write was refused"
|
||||
|
||||
def test_refresh_refuses_to_bake_any_tempdir_home_into_real_user_unit(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""Structural guard: a manual E2E HERMES_HOME like
|
||||
``/tmp/hermes-e2e-41264`` carries none of the pytest markers but
|
||||
poisons the unit identically (seen live 2026-06-11 — an E2E probe ran
|
||||
``hermes gateway restart`` with a /tmp HERMES_HOME exported; the
|
||||
restart's unit refresh baked it into the production unit and the
|
||||
post-update restart produced a 7-hour zombie gateway). The refresh
|
||||
must refuse ANY temp-dir HERMES_HOME, not just pytest-shaped ones.
|
||||
"""
|
||||
unit_path = tmp_path / "hermes-gateway.service"
|
||||
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(
|
||||
gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path
|
||||
)
|
||||
polluted_unit = (
|
||||
"[Service]\n"
|
||||
'Environment="HERMES_HOME=/tmp/hermes-e2e-41264"\n'
|
||||
"WorkingDirectory=/tmp/hermes-e2e-41264\n"
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"generate_systemd_unit",
|
||||
lambda system=False, run_as_user=None: polluted_unit,
|
||||
)
|
||||
|
||||
ran = []
|
||||
|
||||
def fake_run(cmd, check=True, **kwargs):
|
||||
ran.append(cmd)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
result = gateway_cli.refresh_systemd_unit_if_needed(system=False)
|
||||
|
||||
assert result is False, "refresh should refuse to write a temp-home unit"
|
||||
assert (
|
||||
unit_path.read_text(encoding="utf-8") == "old unit\n"
|
||||
), "installed unit must be left untouched"
|
||||
assert not any(
|
||||
"daemon-reload" in str(c) for c in ran
|
||||
), "daemon-reload must not run when write was refused"
|
||||
|
||||
|
||||
class TestTempHomeServiceDefinitionGuard:
|
||||
"""_temp_home_in_service_definition() — structural temp-dir detection."""
|
||||
|
||||
def test_detects_tmp_home_in_systemd_unit(self):
|
||||
unit = '[Service]\nEnvironment="HERMES_HOME=/tmp/hermes-e2e-41264"\n'
|
||||
assert (
|
||||
gateway_cli._temp_home_in_service_definition(unit)
|
||||
== "/tmp/hermes-e2e-41264"
|
||||
)
|
||||
|
||||
def test_detects_var_tmp_home(self):
|
||||
unit = '[Service]\nEnvironment="HERMES_HOME=/var/tmp/hermes-x"\n'
|
||||
assert gateway_cli._temp_home_in_service_definition(unit) is not None
|
||||
|
||||
def test_detects_tempdir_env_home(self, monkeypatch, tmp_path):
|
||||
import tempfile as _tempfile
|
||||
|
||||
monkeypatch.setattr(_tempfile, "gettempdir", lambda: str(tmp_path))
|
||||
unit = f'[Service]\nEnvironment="HERMES_HOME={tmp_path}/hermes-home"\n'
|
||||
assert gateway_cli._temp_home_in_service_definition(unit) is not None
|
||||
|
||||
def test_detects_tmp_home_in_launchd_plist(self):
|
||||
plist = (
|
||||
"<dict>\n <key>HERMES_HOME</key>\n"
|
||||
" <string>/tmp/hermes-e2e-99999</string>\n</dict>\n"
|
||||
)
|
||||
assert (
|
||||
gateway_cli._temp_home_in_service_definition(plist)
|
||||
== "/tmp/hermes-e2e-99999"
|
||||
)
|
||||
|
||||
def test_accepts_real_home(self):
|
||||
unit = '[Service]\nEnvironment="HERMES_HOME=/home/alice/.hermes"\n'
|
||||
assert gateway_cli._temp_home_in_service_definition(unit) is None
|
||||
|
||||
def test_accepts_macos_real_home_plist(self):
|
||||
plist = (
|
||||
"<dict>\n <key>HERMES_HOME</key>\n"
|
||||
" <string>/Users/alice/.hermes</string>\n</dict>\n"
|
||||
)
|
||||
assert gateway_cli._temp_home_in_service_definition(plist) is None
|
||||
|
||||
def test_accepts_unit_without_hermes_home(self):
|
||||
unit = "[Service]\nExecStart=/usr/bin/python -m hermes_cli.main gateway run\n"
|
||||
assert gateway_cli._temp_home_in_service_definition(unit) is None
|
||||
|
||||
def test_tmp_prefixed_non_temp_path_is_accepted(self):
|
||||
# /tmpfs-data is NOT under /tmp — prefix matching must be
|
||||
# component-wise, not string startswith.
|
||||
unit = '[Service]\nEnvironment="HERMES_HOME=/tmpfs-data/.hermes"\n'
|
||||
assert gateway_cli._temp_home_in_service_definition(unit) is None
|
||||
|
||||
|
||||
class TestRequireServiceInstalled:
|
||||
def test_exits_with_install_hint_when_unit_missing(self, tmp_path, monkeypatch, capsys):
|
||||
|
|
@ -481,6 +580,17 @@ class TestLaunchdServiceRecovery:
|
|||
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
# Patch the generator with synthetic content carrying a real-looking
|
||||
# home — the temp-home guard refuses to write plists whose
|
||||
# HERMES_HOME resolves under the (pytest tmp) test HERMES_HOME.
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"generate_launchd_plist",
|
||||
lambda: (
|
||||
"<plist>--replace\n<key>HERMES_HOME</key>"
|
||||
"<string>/Users/alice/.hermes</string></plist>"
|
||||
),
|
||||
)
|
||||
|
||||
calls = []
|
||||
|
||||
|
|
@ -776,6 +886,17 @@ class TestLaunchdServiceRecovery:
|
|||
"""macOS bootstrap error 5 should spawn a detached gateway, not crash."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
# Synthetic plist with a non-temp home so the temp-home write guard
|
||||
# (which would trip on the pytest-tmp test HERMES_HOME) stays out of
|
||||
# the way — this test exercises the bootstrap-error fallback.
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"generate_launchd_plist",
|
||||
lambda: (
|
||||
"<plist><key>HERMES_HOME</key>"
|
||||
"<string>/Users/alice/.hermes</string></plist>"
|
||||
),
|
||||
)
|
||||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd[:2] == ["launchctl", "bootstrap"]:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue