diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index babcad7191..9ad4d0142b 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -3939,6 +3939,33 @@ def cmd_update(args):
         print()
         print("✓ Update complete!")
 
+        # Write exit code *before* the gateway restart attempt.
+        # When running as ``hermes update --gateway`` (spawned by the gateway's
+        # /update command), this process lives inside the gateway's systemd
+        # cgroup. ``systemctl restart hermes-gateway`` kills everything in the
+        # cgroup (KillMode=mixed → SIGKILL to remaining processes), including
+        # us and the wrapping bash shell. The shell never reaches its
+        # ``printf $status > .update_exit_code`` epilogue, so the exit-code
+        # marker file is never created. The new gateway's update watcher then
+        # polls for 30 minutes and sends a spurious timeout message.
+        #
+        # Writing the marker here — after git pull + pip install succeed but
+        # before we attempt the restart — ensures the new gateway sees it
+        # regardless of how we die.
+        if gateway_mode:
+            _exit_code_path = get_hermes_home() / ".update_exit_code"
+            _exit_code_tmp = _exit_code_path.with_name(".update_exit_code.tmp")
+            try:
+                # Stage the marker in a sibling file and rename it into place:
+                # the rename is atomic on POSIX, so the update watcher can never
+                # read an empty marker between file creation and the write.
+                _exit_code_tmp.write_text("0", encoding="utf-8")
+                _exit_code_tmp.replace(_exit_code_path)
+            except OSError as exc:
+                # Best effort — a missing marker only degrades to the watcher
+                # timeout, so report the failure but never abort the update.
+                print(f"⚠ Could not write {_exit_code_path.name}: {exc}")
+
         # Auto-restart ALL gateways after update.
         # The code update (git pull) is shared across all profiles, so every
         # running gateway needs restarting to pick up the new code.
diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py
index 822b22742d..f3f2a0444a 100644
--- a/tests/hermes_cli/test_update_gateway_restart.py
+++ b/tests/hermes_cli/test_update_gateway_restart.py
@@ -798,3 +798,112 @@ class TestFindGatewayPidsExclude:
 
         pids = gateway_cli.find_gateway_pids()
         assert pids == [100]
+
+
+# ---------------------------------------------------------------------------
+# Gateway mode writes exit code before restart (#8300)
+# ---------------------------------------------------------------------------
+
+
+class TestGatewayModeWritesExitCodeEarly:
+    """When running as ``hermes update --gateway``, the exit code marker must be
+    written *before* the gateway restart attempt. Without this, systemd's
+    ``KillMode=mixed`` kills the update process (and its wrapping shell) during
+    the cgroup teardown, so the shell epilogue that normally writes the exit
+    code never executes. The new gateway's update watcher then polls for 30
+    minutes and sends a spurious timeout message.
+    """
+
+    @staticmethod
+    def _isolate_update_env(tmp_path, monkeypatch, *, systemd=False):
+        """Stub platform detection and point HERMES_HOME at a temp dir.
+
+        Shared setup for every test in this class. Returns the path where
+        ``cmd_update`` is expected to write the ``.update_exit_code`` marker.
+        """
+        monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
+        monkeypatch.setattr(
+            gateway_cli, "supports_systemd_services", lambda: systemd,
+        )
+        monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
+
+        # Point HERMES_HOME at a temp dir so the marker file lands there.
+        hermes_home = tmp_path / ".hermes"
+        hermes_home.mkdir()
+        monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+        import hermes_cli.config as _cfg
+        import hermes_cli.main as _main_mod
+        monkeypatch.setattr(_cfg, "get_hermes_home", lambda: hermes_home)
+        # Also patch the module-level ref used by cmd_update.
+        monkeypatch.setattr(_main_mod, "get_hermes_home", lambda: hermes_home)
+
+        return hermes_home / ".update_exit_code"
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_exit_code_written_in_gateway_mode(
+        self, mock_run, _mock_which, capsys, tmp_path, monkeypatch,
+    ):
+        """Gateway mode writes ``0`` to the marker file."""
+        exit_code_path = self._isolate_update_env(tmp_path, monkeypatch)
+        mock_run.side_effect = _make_run_side_effect(commit_count="1")
+
+        args = SimpleNamespace(gateway=True)
+
+        with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
+            cmd_update(args)
+
+        assert exit_code_path.exists(), ".update_exit_code not written in gateway mode"
+        assert exit_code_path.read_text() == "0"
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_exit_code_not_written_in_normal_mode(
+        self, mock_run, _mock_which, capsys, tmp_path, monkeypatch,
+    ):
+        """Non-gateway mode should NOT write the exit code (the shell does it)."""
+        exit_code_path = self._isolate_update_env(tmp_path, monkeypatch)
+        mock_run.side_effect = _make_run_side_effect(commit_count="1")
+
+        args = SimpleNamespace(gateway=False)
+
+        with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
+            cmd_update(args)
+
+        assert not exit_code_path.exists(), ".update_exit_code should not be written outside gateway mode"
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_exit_code_written_before_restart_call(
+        self, mock_run, _mock_which, capsys, tmp_path, monkeypatch,
+    ):
+        """Exit code must exist BEFORE systemctl restart is called."""
+        exit_code_path = self._isolate_update_env(
+            tmp_path, monkeypatch, systemd=True,
+        )
+
+        # Record, for every ``systemctl ... restart`` call, whether the marker
+        # already existed at that moment.
+        exit_code_existed_at_restart = []
+
+        original_side_effect = _make_run_side_effect(
+            commit_count="1", systemd_active=True,
+        )
+
+        def tracking_side_effect(cmd, **kwargs):
+            # ``cmd`` may be an argv list or a single shell string.
+            joined = cmd if isinstance(cmd, str) else " ".join(str(c) for c in cmd)
+            if "systemctl" in joined and "restart" in joined:
+                exit_code_existed_at_restart.append(exit_code_path.exists())
+            return original_side_effect(cmd, **kwargs)
+
+        mock_run.side_effect = tracking_side_effect
+
+        args = SimpleNamespace(gateway=True)
+
+        with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
+            cmd_update(args)
+
+        assert exit_code_existed_at_restart, "systemctl restart was never called"
+        assert all(exit_code_existed_at_restart), \
+            ".update_exit_code must exist BEFORE systemctl restart (cgroup kill race)"