fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME (#44267)

* fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME

A test/E2E harness that exports HERMES_HOME=/tmp/... and touches any
gateway service write path (install, start self-heal, restart's
refresh_systemd_unit_if_needed) bakes the throwaway home into the
production systemd unit / launchd plist. The gateway then restarts
'healthy' but pointed at an empty temp home — no platforms enabled,
deaf to every message (live incident 2026-06-11: /tmp/hermes-e2e-41264
poisoned the unit during a PR-review E2E probe; the post-update restart
produced a 7-hour zombie gateway).

The existing safety belt only sniffed pytest-shaped markers
(/pytest-of-, /hermes_test). Add a structural guard:
_temp_home_in_service_definition() extracts HERMES_HOME from the
generated systemd unit or launchd plist and refuses the write (with
actionable guidance) when it resolves under tempfile.gettempdir(),
/tmp, /var/tmp, or the macOS /private variants. Wired into all five
write sites: systemd refresh + install, launchd refresh + install +
start self-heal.

* test: patch unit generator in install tests tripped by temp-home guard

CI runs hermetic with HERMES_HOME under a tmp dir, so the real
generate_systemd_unit() output now (correctly) trips the new temp-home
write guard in three install tests. Patch the generator with synthetic
non-temp content — same pattern the existing pytest-marker guard tests
use.
This commit is contained in:
Teknium 2026-06-11 06:10:08 -07:00 committed by GitHub
parent 8972a151a4
commit f456f302df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 231 additions and 6 deletions

View file

@ -2531,6 +2531,65 @@ def systemd_unit_is_current(system: bool = False) -> bool:
return norm_installed == norm_expected
def _temp_home_in_service_definition(definition: str) -> str | None:
"""Return the temp-dir HERMES_HOME baked into a service definition, or None.
A generated systemd unit / launchd plist carries the resolved HERMES_HOME
in its environment block. If that path lives under the system temp dir,
the definition was almost certainly generated by a test/E2E harness that
exported a throwaway ``HERMES_HOME=/tmp/...`` writing it to the real
service file silently breaks the user's gateway on the next (re)start:
the gateway comes back "active (running)" but pointed at an empty temp
home ("No messaging platforms enabled"), deaf to every platform.
Seen live 2026-06-11: an E2E guard probe ran ``hermes gateway restart``
with ``HERMES_HOME=/tmp/hermes-e2e-<pr>`` exported; the restart path's
unit refresh baked the temp path into the production unit and the
post-update restart produced a zombie gateway for 7+ hours.
Matches both systemd ``Environment="HERMES_HOME=..."`` lines and launchd
``<key>HERMES_HOME</key><string>...</string>`` pairs.
"""
import re
import tempfile
candidates = re.findall(r'HERMES_HOME=([^"\n]+)', definition)
candidates += re.findall(
r"<key>HERMES_HOME</key>\s*<string>(.*?)</string>", definition, flags=re.S
)
temp_roots = {
Path(tempfile.gettempdir()).resolve(),
Path("/tmp"),
Path("/var/tmp"),
Path("/private/tmp"),
Path("/private/var/tmp"),
}
for raw in candidates:
try:
resolved = Path(raw.strip().strip('"')).resolve()
except (OSError, ValueError):
continue
for root in temp_roots:
if resolved == root or root in resolved.parents:
return raw.strip()
return None
def _refuse_temp_home_service_write(definition: str, kind: str) -> bool:
    """Report whether writing ``definition`` must be refused for a temp HERMES_HOME.

    Args:
        definition: Generated service definition text to inspect.
        kind: Human-readable name of the artifact, interpolated into the
            refusal message (callers pass e.g. "systemd unit").

    Returns:
        ``True`` after printing a two-line refusal with guidance when the
        definition carries a temp-dir HERMES_HOME; ``False`` (printing
        nothing) otherwise.
    """
    offending_home = _temp_home_in_service_definition(definition)
    if offending_home is not None:
        refusal = (
            f"✗ Refusing to write the gateway {kind}: HERMES_HOME resolves to a "
            f"temporary directory ({offending_home})."
        )
        guidance = (
            " This usually means a test/E2E environment exported HERMES_HOME. "
            "Unset it (or run from a clean shell) and retry."
        )
        print(refusal)
        print(guidance)
        return True
    return False
def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
"""Rewrite the installed systemd unit when the generated definition has changed."""
unit_path = get_systemd_unit_path(system=system)
@ -2561,6 +2620,12 @@ def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
):
return False
# Structural variant of the same belt: refuse to bake ANY temp-dir
# HERMES_HOME into the unit (manual E2E homes like /tmp/hermes-e2e-NNN
# don't carry the pytest markers above but poison the unit identically).
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
return False
unit_path.write_text(new_unit, encoding="utf-8")
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
print(
@ -2729,10 +2794,11 @@ def systemd_install(
return
unit_path.parent.mkdir(parents=True, exist_ok=True)
new_unit = generate_systemd_unit(system=system, run_as_user=run_as_user)
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
return
print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}")
unit_path.write_text(
generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8"
)
unit_path.write_text(new_unit, encoding="utf-8")
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
if enable_on_startup:
@ -3362,7 +3428,11 @@ def refresh_launchd_plist_if_needed() -> bool:
if not plist_path.exists() or launchd_plist_is_current():
return False
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
new_plist = generate_launchd_plist()
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
return False
plist_path.write_text(new_plist, encoding="utf-8")
label = get_launchd_label()
# Bootout/bootstrap so launchd picks up the new definition
subprocess.run(
@ -3395,8 +3465,11 @@ def launchd_install(force: bool = False):
return
plist_path.parent.mkdir(parents=True, exist_ok=True)
new_plist = generate_launchd_plist()
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
return
print(f"Installing launchd service to: {plist_path}")
plist_path.write_text(generate_launchd_plist())
plist_path.write_text(new_plist)
try:
subprocess.run(
@ -3442,9 +3515,12 @@ def launchd_start():
# Self-heal if the plist is missing entirely (e.g., manual cleanup, failed upgrade)
if not plist_path.exists():
new_plist = generate_launchd_plist()
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
sys.exit(1)
print("↻ launchd plist missing; regenerating service definition")
plist_path.parent.mkdir(parents=True, exist_ok=True)
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
plist_path.write_text(new_plist, encoding="utf-8")
try:
subprocess.run(
["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],