From af6f9bc2a12682b06fb3632acf5a9cbf01e74a85 Mon Sep 17 00:00:00 2001
From: Siddharth Balyan <52913345+alt-glitch@users.noreply.github.com>
Date: Mon, 4 May 2026 16:27:51 +0530
Subject: [PATCH] fix: refresh systemd unit on gateway boot (not just start/restart) (#19684)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The resilient restart settings from PR #18639 only took effect when the
gateway was started via `hermes gateway start` or `hermes gateway restart`
— both of which call refresh_systemd_unit_if_needed(), which writes the
new unit and runs daemon-reload.

However, when the gateway self-restarts via exit-code-75 (stale-code
detection after `hermes update`, or the /restart command), systemd
respawns the process directly without going through any CLI function.
The unit file on disk stays stale, and systemd keeps using the old cached
settings (StartLimitBurst=5, RestartSec=30) until someone manually runs
`hermes gateway restart`.

This meant that after PR #18639 was deployed, users who never ran
`hermes gateway restart` manually were still vulnerable to the
permanent-death-on-network-outage bug.

Fix: call refresh_systemd_unit_if_needed() at the top of run_gateway()
(the foreground entry point that systemd's ExecStart invokes). This
ensures that on every boot — whether triggered by systemd restart,
exit-75 respawn, or manual foreground run — the unit definition and
daemon state are current.

The call is best-effort (exceptions caught) and a no-op when the unit is
already current (one stat + string compare).
--- hermes_cli/gateway.py | 14 ++++++++++ tests/hermes_cli/test_gateway_service.py | 34 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index c7abea5bad..7dec83cbff 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -2492,6 +2492,20 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False): hasn't fully exited yet. """ sys.path.insert(0, str(PROJECT_ROOT)) + + # Refresh the systemd unit definition on every boot so that restart + # settings (RestartSec, StartLimitIntervalSec, etc.) stay current even + # when the process was respawned via exit-code-75 (stale-code or + # /restart) rather than through `hermes gateway restart` which already + # calls refresh_systemd_unit_if_needed(). Without this, a code update + # that ships new unit settings won't take effect until the next manual + # `hermes gateway start/restart` — leaving the gateway vulnerable to + # the exact failure mode the new settings were meant to prevent. 
+ if supports_systemd_services(): + try: + refresh_systemd_unit_if_needed(system=False) + except Exception: + pass # best-effort; don't block gateway startup from gateway.run import start_gateway diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index f2bfa8b870..a2e3869c8c 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -107,6 +107,40 @@ class TestSystemdServiceRefresh: ] + def test_run_gateway_refreshes_outdated_unit_on_boot(self, tmp_path, monkeypatch): + """run_gateway() should refresh the systemd unit on boot so that + restart settings take effect even when the process was respawned + via exit-code-75 (bypassing `hermes gateway restart`).""" + unit_path = tmp_path / "hermes-gateway.service" + unit_path.write_text("old unit\n", encoding="utf-8") + + monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path) + monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n") + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + + calls = [] + + def fake_run(cmd, check=True, **kwargs): + calls.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + # Prevent run_gateway from actually starting the gateway + def fake_start_gateway(**kwargs): + import asyncio + f = asyncio.Future() + f.set_result(True) + return f + + monkeypatch.setattr("gateway.run.start_gateway", fake_start_gateway) + + gateway_cli.run_gateway() + + assert unit_path.read_text(encoding="utf-8") == "new unit\n" + assert ["systemctl", "--user", "daemon-reload"] in calls + + class TestGeneratedSystemdUnits: def test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self): unit = gateway_cli.generate_systemd_unit(system=False)