mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-13 09:01:54 +00:00
fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME (#44267)
* fix(gateway): refuse to write service definitions with a temp-dir HERMES_HOME

  A test/E2E harness that exports HERMES_HOME=/tmp/... and touches any gateway service write path (install, start self-heal, restart's refresh_systemd_unit_if_needed) bakes the throwaway home into the production systemd unit / launchd plist. The gateway then restarts 'healthy' but pointed at an empty temp home — no platforms enabled, deaf to every message (live incident 2026-06-11: /tmp/hermes-e2e-41264 poisoned the unit during a PR-review E2E probe; the post-update restart produced a 7-hour zombie gateway).

  The existing safety belt only sniffed pytest-shaped markers (/pytest-of-, /hermes_test). Add a structural guard: _temp_home_in_service_definition() extracts HERMES_HOME from the generated systemd unit or launchd plist and refuses the write (with actionable guidance) when it resolves under tempfile.gettempdir(), /tmp, /var/tmp, or the macOS /private variants. Wired into all five write sites: systemd refresh + install, launchd refresh + install + start self-heal.

* test: patch unit generator in install tests tripped by temp-home guard

  CI runs hermetically with HERMES_HOME under a tmp dir, so the real generate_systemd_unit() output now (correctly) trips the new temp-home write guard in three install tests. Patch the generator with synthetic non-temp content — same pattern the existing pytest-marker guard tests use.
This commit is contained in:
parent
8972a151a4
commit
f456f302df
4 changed files with 231 additions and 6 deletions
|
|
@ -2531,6 +2531,65 @@ def systemd_unit_is_current(system: bool = False) -> bool:
|
|||
return norm_installed == norm_expected
|
||||
|
||||
|
||||
def _temp_home_in_service_definition(definition: str) -> str | None:
|
||||
"""Return the temp-dir HERMES_HOME baked into a service definition, or None.
|
||||
|
||||
A generated systemd unit / launchd plist carries the resolved HERMES_HOME
|
||||
in its environment block. If that path lives under the system temp dir,
|
||||
the definition was almost certainly generated by a test/E2E harness that
|
||||
exported a throwaway ``HERMES_HOME=/tmp/...`` — writing it to the real
|
||||
service file silently breaks the user's gateway on the next (re)start:
|
||||
the gateway comes back "active (running)" but pointed at an empty temp
|
||||
home ("No messaging platforms enabled"), deaf to every platform.
|
||||
Seen live 2026-06-11: an E2E guard probe ran ``hermes gateway restart``
|
||||
with ``HERMES_HOME=/tmp/hermes-e2e-<pr>`` exported; the restart path's
|
||||
unit refresh baked the temp path into the production unit and the
|
||||
post-update restart produced a zombie gateway for 7+ hours.
|
||||
|
||||
Matches both systemd ``Environment="HERMES_HOME=..."`` lines and launchd
|
||||
``<key>HERMES_HOME</key><string>...</string>`` pairs.
|
||||
"""
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
candidates = re.findall(r'HERMES_HOME=([^"\n]+)', definition)
|
||||
candidates += re.findall(
|
||||
r"<key>HERMES_HOME</key>\s*<string>(.*?)</string>", definition, flags=re.S
|
||||
)
|
||||
temp_roots = {
|
||||
Path(tempfile.gettempdir()).resolve(),
|
||||
Path("/tmp"),
|
||||
Path("/var/tmp"),
|
||||
Path("/private/tmp"),
|
||||
Path("/private/var/tmp"),
|
||||
}
|
||||
for raw in candidates:
|
||||
try:
|
||||
resolved = Path(raw.strip().strip('"')).resolve()
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
for root in temp_roots:
|
||||
if resolved == root or root in resolved.parents:
|
||||
return raw.strip()
|
||||
return None
|
||||
|
||||
|
||||
def _refuse_temp_home_service_write(definition: str, kind: str) -> bool:
    """Veto (and explain) a service-definition write that embeds a temp HERMES_HOME.

    Args:
        definition: Generated service definition text that is about to be
            written to disk.
        kind: Human-readable name of the artifact, interpolated into the
            refusal message (callers pass e.g. "systemd unit" or
            "launchd plist").

    Returns:
        True when a temp-dir HERMES_HOME was found and the two-line refusal
        was printed, meaning the caller must skip the write; False when the
        definition carries no temp-dir HERMES_HOME.
    """
    offending_home = _temp_home_in_service_definition(definition)
    if offending_home is not None:
        refusal = (
            f"✗ Refusing to write the gateway {kind}: HERMES_HOME resolves to a "
            f"temporary directory ({offending_home})."
        )
        remedy = (
            " This usually means a test/E2E environment exported HERMES_HOME. "
            "Unset it (or run from a clean shell) and retry."
        )
        # Emit each line with its own print call, matching the original output.
        for message in (refusal, remedy):
            print(message)
        return True
    return False
|
||||
|
||||
|
||||
def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
|
||||
"""Rewrite the installed systemd unit when the generated definition has changed."""
|
||||
unit_path = get_systemd_unit_path(system=system)
|
||||
|
|
@ -2561,6 +2620,12 @@ def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
|
|||
):
|
||||
return False
|
||||
|
||||
# Structural variant of the same belt: refuse to bake ANY temp-dir
|
||||
# HERMES_HOME into the unit (manual E2E homes like /tmp/hermes-e2e-NNN
|
||||
# don't carry the pytest markers above but poison the unit identically).
|
||||
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
|
||||
return False
|
||||
|
||||
unit_path.write_text(new_unit, encoding="utf-8")
|
||||
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
|
||||
print(
|
||||
|
|
@ -2729,10 +2794,11 @@ def systemd_install(
|
|||
return
|
||||
|
||||
unit_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
new_unit = generate_systemd_unit(system=system, run_as_user=run_as_user)
|
||||
if _refuse_temp_home_service_write(new_unit, "systemd unit"):
|
||||
return
|
||||
print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}")
|
||||
unit_path.write_text(
|
||||
generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8"
|
||||
)
|
||||
unit_path.write_text(new_unit, encoding="utf-8")
|
||||
|
||||
_run_systemctl(["daemon-reload"], system=system, check=True, timeout=30)
|
||||
if enable_on_startup:
|
||||
|
|
@ -3362,7 +3428,11 @@ def refresh_launchd_plist_if_needed() -> bool:
|
|||
if not plist_path.exists() or launchd_plist_is_current():
|
||||
return False
|
||||
|
||||
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
|
||||
new_plist = generate_launchd_plist()
|
||||
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
|
||||
return False
|
||||
|
||||
plist_path.write_text(new_plist, encoding="utf-8")
|
||||
label = get_launchd_label()
|
||||
# Bootout/bootstrap so launchd picks up the new definition
|
||||
subprocess.run(
|
||||
|
|
@ -3395,8 +3465,11 @@ def launchd_install(force: bool = False):
|
|||
return
|
||||
|
||||
plist_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
new_plist = generate_launchd_plist()
|
||||
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
|
||||
return
|
||||
print(f"Installing launchd service to: {plist_path}")
|
||||
plist_path.write_text(generate_launchd_plist())
|
||||
plist_path.write_text(new_plist)
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
|
|
@ -3442,9 +3515,12 @@ def launchd_start():
|
|||
|
||||
# Self-heal if the plist is missing entirely (e.g., manual cleanup, failed upgrade)
|
||||
if not plist_path.exists():
|
||||
new_plist = generate_launchd_plist()
|
||||
if _refuse_temp_home_service_write(new_plist, "launchd plist"):
|
||||
sys.exit(1)
|
||||
print("↻ launchd plist missing; regenerating service definition")
|
||||
plist_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
|
||||
plist_path.write_text(new_plist, encoding="utf-8")
|
||||
try:
|
||||
subprocess.run(
|
||||
["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue