mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-07 02:51:50 +00:00
fix: refresh systemd unit on gateway boot (not just start/restart) (#19684)
The resilient restart settings from PR #18639 only took effect when the gateway was started via `hermes gateway start` or `hermes gateway restart` — both of which call refresh_systemd_unit_if_needed() which writes the new unit and runs daemon-reload. However, when the gateway self-restarts via exit-code-75 (stale-code detection after `hermes update`, or the /restart command), systemd respawns the process directly without going through any CLI function. The unit file on disk stays stale, and systemd keeps using the old cached settings (StartLimitBurst=5, RestartSec=30) until someone manually runs `hermes gateway restart`. This meant that after PR #18639 was deployed, users who never ran `hermes gateway restart` manually were still vulnerable to the permanent-death-on-network-outage bug. Fix: call refresh_systemd_unit_if_needed() at the top of run_gateway() (the foreground entry point that systemd's ExecStart invokes). This ensures that on every boot — whether triggered by systemd restart, exit-75 respawn, or manual foreground run — the unit definition and daemon state are current. The call is best-effort (exceptions caught) and a no-op when the unit is already current (one stat + string compare).
This commit is contained in:
parent
33f554d83c
commit
af6f9bc2a1
2 changed files with 48 additions and 0 deletions
|
|
@ -2493,6 +2493,20 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
|
||||||
"""
|
"""
|
||||||
sys.path.insert(0, str(PROJECT_ROOT))
|
sys.path.insert(0, str(PROJECT_ROOT))
|
||||||
|
|
||||||
|
# Refresh the systemd unit definition on every boot so that restart
|
||||||
|
# settings (RestartSec, StartLimitIntervalSec, etc.) stay current even
|
||||||
|
# when the process was respawned via exit-code-75 (stale-code or
|
||||||
|
# /restart) rather than through `hermes gateway restart` which already
|
||||||
|
# calls refresh_systemd_unit_if_needed(). Without this, a code update
|
||||||
|
# that ships new unit settings won't take effect until the next manual
|
||||||
|
# `hermes gateway start/restart` — leaving the gateway vulnerable to
|
||||||
|
# the exact failure mode the new settings were meant to prevent.
|
||||||
|
if supports_systemd_services():
|
||||||
|
try:
|
||||||
|
refresh_systemd_unit_if_needed(system=False)
|
||||||
|
except Exception:
|
||||||
|
pass # best-effort; don't block gateway startup
|
||||||
|
|
||||||
from gateway.run import start_gateway
|
from gateway.run import start_gateway
|
||||||
|
|
||||||
print("┌─────────────────────────────────────────────────────────┐")
|
print("┌─────────────────────────────────────────────────────────┐")
|
||||||
|
|
|
||||||
|
|
@ -107,6 +107,40 @@ class TestSystemdServiceRefresh:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_gateway_refreshes_outdated_unit_on_boot(self, tmp_path, monkeypatch):
    """run_gateway() should refresh the systemd unit on boot so that
    restart settings take effect even when the process was respawned
    via exit-code-75 (bypassing `hermes gateway restart`).

    The test seeds a stale unit file, stubs unit generation and
    ``subprocess.run``, replaces the real gateway entry point with a
    no-op, and then checks that the unit file was rewritten and that a
    user-scope ``daemon-reload`` was issued.
    """
    # Stale unit on disk; the refresh is expected to overwrite it.
    unit_path = tmp_path / "hermes-gateway.service"
    unit_path.write_text("old unit\n", encoding="utf-8")

    monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path)
    monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n")
    monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)

    # Record every command handed to subprocess.run instead of executing it.
    calls = []

    def fake_run(cmd, check=True, **kwargs):
        calls.append(cmd)
        return SimpleNamespace(returncode=0, stdout="", stderr="")

    monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)

    # Prevent run_gateway from actually starting the gateway.  A coroutine
    # function is used rather than a pre-resolved asyncio.Future: creating a
    # Future in synchronous code needs an implicit event-loop lookup (deprecated,
    # and an error on recent Python versions), and a bare Future is rejected by
    # asyncio.run() on versions that require a coroutine there.
    # NOTE(review): how run_gateway drives this awaitable is not visible here;
    # presumably via an event loop — confirm.
    async def fake_start_gateway(**kwargs):
        return True

    # run_gateway imports start_gateway from gateway.run at call time, so the
    # attribute on that module is what it will pick up.
    monkeypatch.setattr("gateway.run.start_gateway", fake_start_gateway)

    gateway_cli.run_gateway()

    assert unit_path.read_text(encoding="utf-8") == "new unit\n"
    assert ["systemctl", "--user", "daemon-reload"] in calls
|
||||||
|
|
||||||
|
|
||||||
class TestGeneratedSystemdUnits:
|
class TestGeneratedSystemdUnits:
|
||||||
def test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self):
|
def test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self):
|
||||||
unit = gateway_cli.generate_systemd_unit(system=False)
|
unit = gateway_cli.generate_systemd_unit(system=False)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue