diff --git a/gateway/run.py b/gateway/run.py index 2c377980c..3eb932cc2 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -10373,9 +10373,9 @@ class GatewayRunner: # Periodic "still working" notifications for long-running tasks. # Fires every N seconds so the user knows the agent hasn't died. # Config: agent.gateway_notify_interval in config.yaml, or - # HERMES_AGENT_NOTIFY_INTERVAL env var. Default 600s (10 min). + # HERMES_AGENT_NOTIFY_INTERVAL env var. Default 180s (3 min). # 0 = disable notifications. - _NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 600)) + _NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 180)) _NOTIFY_INTERVAL = _NOTIFY_INTERVAL_RAW if _NOTIFY_INTERVAL_RAW > 0 else None _notify_start = time.time() diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 36e478a70..cfcc7ff28 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -384,7 +384,11 @@ DEFAULT_CONFIG = { # Periodic "still working" notification interval (seconds). # Sends a status message every N seconds so the user knows the # agent hasn't died during long tasks. 0 = disable notifications. - "gateway_notify_interval": 600, + # Lower values mean faster feedback on slow tasks but more chat + # noise; 180s is a compromise that catches spinning weak-model runs + # (60+ tool iterations with tiny output) before users assume the + # bot is dead and /restart. + "gateway_notify_interval": 180, }, "terminal": { diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 9773299d5..3b828fecf 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -175,6 +175,60 @@ def _request_gateway_self_restart(pid: int) -> bool: return True +def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool: + """Send SIGUSR1 to a gateway PID and wait for it to exit gracefully. + + SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)`` + which drains in-flight agent runs (up to ``agent.restart_drain_timeout`` + seconds), then exits with code 75. Both systemd (``Restart=on-failure`` + + ``RestartForceExitStatus=75``) and launchd (``KeepAlive.SuccessfulExit + = false``) relaunch the process after the graceful exit. + + This is the drain-aware alternative to ``systemctl restart`` / ``SIGTERM``, + which SIGKILL in-flight agents after a short timeout. + + Args: + pid: Gateway process PID (systemd MainPID, launchd PID, or bare + process PID). + drain_timeout: Seconds to wait for the process to exit after sending + SIGUSR1. Should be slightly larger than the gateway's + ``agent.restart_drain_timeout`` to allow the drain loop to + finish cleanly. + + Returns: + True if the PID was signalled and exited within the timeout. + False if SIGUSR1 couldn't be sent or the process didn't exit in + time (caller should fall back to a harder restart path). + """ + if not hasattr(signal, "SIGUSR1"): + return False + if pid <= 0: + return False + try: + os.kill(pid, signal.SIGUSR1) + except ProcessLookupError: + # Already gone — nothing to drain. + return True + except (PermissionError, OSError): + return False + + import time as _time + + deadline = _time.monotonic() + max(drain_timeout, 1.0) + while _time.monotonic() < deadline: + try: + os.kill(pid, 0) # signal 0 — probe liveness + except ProcessLookupError: + return True + except PermissionError: + # Process still exists but we can't signal it. Treat as alive + # so the caller falls back. + pass + _time.sleep(0.5) + # Drain didn't finish in time. + return False + + def _append_unique_pid(pids: list[int], pid: int | None, exclude_pids: set[int]) -> None: if pid is None or pid <= 0: return diff --git a/hermes_cli/main.py b/hermes_cli/main.py index cb70261b4..d7de30960 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -5864,12 +5864,15 @@ def _cmd_update_impl(args, gateway_mode: bool): # Write exit code *before* the gateway restart attempt. # When running as ``hermes update --gateway`` (spawned by the gateway's # /update command), this process lives inside the gateway's systemd - # cgroup. ``systemctl restart hermes-gateway`` kills everything in the - # cgroup (KillMode=mixed → SIGKILL to remaining processes), including - # us and the wrapping bash shell. The shell never reaches its - # ``printf $status > .update_exit_code`` epilogue, so the exit-code - # marker file is never created. The new gateway's update watcher then - # polls for 30 minutes and sends a spurious timeout message. + # cgroup. A graceful SIGUSR1 restart keeps the drain loop alive long + # enough for the exit-code marker to be written below, but the + # fallback ``systemctl restart`` path (see below) kills everything in + # the cgroup (KillMode=mixed → SIGKILL to remaining processes), + # including us and the wrapping bash shell. The shell never reaches + # its ``printf $status > .update_exit_code`` epilogue, so the + # exit-code marker file would never be created. The new gateway's + # update watcher would then poll for 30 minutes and send a spurious + # timeout message. # # Writing the marker here — after git pull + pip install succeed but # before we attempt the restart — ensures the new gateway sees it @@ -5891,9 +5894,37 @@ def _cmd_update_impl(args, gateway_mode: bool): _ensure_user_systemd_env, find_gateway_pids, _get_service_pids, + _graceful_restart_via_sigusr1, ) import signal as _signal + # Drain budget for graceful SIGUSR1 restarts. The gateway drains + # for up to ``agent.restart_drain_timeout`` (default 60s) before + # exiting with code 75; we wait slightly longer so the drain + # completes before we fall back to a hard restart. On older + # systemd units without SIGUSR1 wiring this wait just times out + # and we fall back to ``systemctl restart`` (the old behaviour). + try: + from hermes_constants import ( + DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT as _DEFAULT_DRAIN, + ) + except Exception: + _DEFAULT_DRAIN = 60.0 + _cfg_drain = None + try: + from hermes_cli.config import load_config + _cfg_agent = (load_config().get("agent") or {}) + _cfg_drain = _cfg_agent.get("restart_drain_timeout") + except Exception: + pass + try: + _drain_budget = float(_cfg_drain) if _cfg_drain is not None else float(_DEFAULT_DRAIN) + except (TypeError, ValueError): + _drain_budget = float(_DEFAULT_DRAIN) + # Add a 15s margin so the drain loop + final exit finish before + # we escalate to ``systemctl restart`` / SIGTERM. + _drain_budget = max(_drain_budget, 30.0) + 15.0 + restarted_services = [] killed_pids = set() @@ -5940,59 +5971,114 @@ def _cmd_update_impl(args, gateway_mode: bool): text=True, timeout=5, ) - if check.stdout.strip() == "active": - restart = subprocess.run( - scope_cmd + ["restart", svc_name], + if check.stdout.strip() != "active": + continue + + # Prefer a graceful SIGUSR1 restart so in-flight + # agent runs drain instead of being SIGKILLed. + # The gateway's SIGUSR1 handler calls + # request_restart(via_service=True) → drain → + # exit(75); systemd's Restart=on-failure (and + # RestartForceExitStatus=75) respawns the unit. + _main_pid = 0 + try: + _show = subprocess.run( + scope_cmd + [ + "show", svc_name, + "--property=MainPID", "--value", + ], + capture_output=True, text=True, timeout=5, + ) + _main_pid = int((_show.stdout or "").strip() or 0) + except (ValueError, subprocess.TimeoutExpired, FileNotFoundError): + _main_pid = 0 + + _graceful_ok = False + if _main_pid > 0: + print( + f" → {svc_name}: draining (up to {int(_drain_budget)}s)..." + ) + _graceful_ok = _graceful_restart_via_sigusr1( + _main_pid, drain_timeout=_drain_budget, + ) + + if _graceful_ok: + # Gateway exited 75; systemd should relaunch + # via Restart=on-failure. Verify the new + # process came up. + _time.sleep(3) + verify = subprocess.run( + scope_cmd + ["is-active", svc_name], + capture_output=True, text=True, timeout=5, + ) + if verify.stdout.strip() == "active": + restarted_services.append(svc_name) + continue + # Process exited but wasn't respawned (older + # unit without Restart=on-failure or + # RestartForceExitStatus=75). Fall through + # to systemctl start/restart. + print( + f" ⚠ {svc_name} drained but didn't relaunch — forcing restart" + ) + + # Fallback: blunt systemctl restart. This is + # what the old code always did; we get here only + # when the graceful path failed (unit missing + # SIGUSR1 wiring, drain exceeded the budget, + # restart-policy mismatch). + restart = subprocess.run( + scope_cmd + ["restart", svc_name], + capture_output=True, + text=True, + timeout=15, + ) + if restart.returncode == 0: + # Verify the service actually survived the + # restart. systemctl restart returns 0 even + # if the new process crashes immediately. + _time.sleep(3) + verify = subprocess.run( + scope_cmd + ["is-active", svc_name], capture_output=True, text=True, - timeout=15, + timeout=5, ) - if restart.returncode == 0: - # Verify the service actually survived the - # restart. systemctl restart returns 0 even - # if the new process crashes immediately. + if verify.stdout.strip() == "active": + restarted_services.append(svc_name) + else: + # Retry once — transient startup failures + # (stale module cache, import race) often + # resolve on the second attempt. + print( + f" ⚠ {svc_name} died after restart, retrying..." + ) + retry = subprocess.run( + scope_cmd + ["restart", svc_name], + capture_output=True, + text=True, + timeout=15, + ) _time.sleep(3) - verify = subprocess.run( + verify2 = subprocess.run( scope_cmd + ["is-active", svc_name], capture_output=True, text=True, timeout=5, ) - if verify.stdout.strip() == "active": + if verify2.stdout.strip() == "active": restarted_services.append(svc_name) + print(f" ✓ {svc_name} recovered on retry") else: - # Retry once — transient startup failures - # (stale module cache, import race) often - # resolve on the second attempt. print( - f" ⚠ {svc_name} died after restart, retrying..." + f" ✗ {svc_name} failed to stay running after restart.\n" + f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n" + f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}" ) - retry = subprocess.run( - scope_cmd + ["restart", svc_name], - capture_output=True, - text=True, - timeout=15, - ) - _time.sleep(3) - verify2 = subprocess.run( - scope_cmd + ["is-active", svc_name], - capture_output=True, - text=True, - timeout=5, - ) - if verify2.stdout.strip() == "active": - restarted_services.append(svc_name) - print(f" ✓ {svc_name} recovered on retry") - else: - print( - f" ✗ {svc_name} failed to stay running after restart.\n" - f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n" - f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}" - ) - else: - print( - f" ⚠ Failed to restart {svc_name}: {restart.stderr.strip()}" - ) + else: + print( + f" ⚠ Failed to restart {svc_name}: {restart.stderr.strip()}" + ) except (FileNotFoundError, subprocess.TimeoutExpired): pass diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py index 2a2bc962d..1c7e1b96c 100644 --- a/tests/hermes_cli/test_update_gateway_restart.py +++ b/tests/hermes_cli/test_update_gateway_restart.py @@ -422,6 +422,152 @@ class TestCmdUpdateLaunchdRestart: ] assert len(restart_calls) == 1 + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_prefers_sigusr1_over_systemctl_restart_when_mainpid_known( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, + ): + """Drain-aware update: when systemctl show reports a MainPID, the + update path sends SIGUSR1 and waits for graceful exit + respawn, + instead of ``systemctl restart`` (which SIGKILLs in-flight agents). + """ + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + + # Track state: before kill → "active" (old PID), + # after kill + exit → briefly inactive, then "active" again (new PID). + state = {"killed": False} + + def side_effect(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + + if "rev-parse" in joined and "--abbrev-ref" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="") + if "rev-parse" in joined and "--verify" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "rev-list" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="") + + # Only expose a user-scope service. + if "systemctl" in joined and "list-units" in joined: + if "--user" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="hermes-gateway.service loaded active running\n", + stderr="", + ) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + if "systemctl" in joined and "is-active" in joined: + # Pre-kill: active. Post-kill: active again (respawned by + # Restart=on-failure). The drain loop verifies liveness + # separately via os.kill(pid, 0). + return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="") + + # The new code path. + if "systemctl" in joined and "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="") + + # If systemctl restart is called, this test fails its intent — + # but still let it succeed so we can assert it was NOT called. + if "systemctl" in joined and "restart" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + mock_run.side_effect = side_effect + + # Track SIGUSR1 delivery and simulate the gateway draining + exiting. + sigusr1_sent = {"value": False} + + def fake_kill(pid, sig): + import signal as _s + if pid == 4242 and sig == _s.SIGUSR1: + sigusr1_sent["value"] = True + state["killed"] = True + return + if pid == 4242 and sig == 0: + # Liveness probe — report dead once SIGUSR1 has been sent. + if state["killed"]: + raise ProcessLookupError() + return + # For any other PID/sig combination, succeed silently. + return + + monkeypatch.setattr("os.kill", fake_kill) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[]): + cmd_update(mock_args) + + # SIGUSR1 must have been delivered to the gateway MainPID. + assert sigusr1_sent["value"], "Expected SIGUSR1 to be sent to MainPID" + + # And `systemctl restart` must NOT have been used (that's the + # non-draining kill-everything path we're moving away from). + restart_calls = [ + c for c in mock_run.call_args_list + if "systemctl" in " ".join(str(a) for a in c.args[0]) + and "restart" in " ".join(str(a) for a in c.args[0]) + ] + assert restart_calls == [], ( + "Graceful SIGUSR1 succeeded; `systemctl restart` should not " + f"have been called. Got: {restart_calls}" + ) + + captured = capsys.readouterr().out + assert "draining" in captured.lower() + assert "Restarted hermes-gateway" in captured + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_falls_back_to_systemctl_restart_when_sigusr1_times_out( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, + ): + """If the gateway doesn't exit within the drain budget (e.g. old unit + missing ``Restart=on-failure`` or an agent ignoring SIGUSR1), the + update path falls back to ``systemctl restart``. + """ + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + + mock_run.side_effect = _make_run_side_effect( + commit_count="3", + systemd_active=True, + ) + + # Patch systemctl show to report MainPID=4242 so cmd_update attempts + # the graceful path. + orig = mock_run.side_effect + def wrapped(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "systemctl" in joined and "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="") + return orig(cmd, **kwargs) + mock_run.side_effect = wrapped + + # Simulate the drain helper failing to confirm a clean exit — either + # because the gateway ignored SIGUSR1 or the drain budget was + # exceeded. cmd_update() should detect this and escalate. + monkeypatch.setattr( + "hermes_cli.gateway._graceful_restart_via_sigusr1", + lambda pid, drain_timeout: False, + ) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[]): + cmd_update(mock_args) + + # Fallback kicked in → systemctl restart was called. + restart_calls = [ + c for c in mock_run.call_args_list + if "systemctl" in " ".join(str(a) for a in c.args[0]) + and "restart" in " ".join(str(a) for a in c.args[0]) + ] + assert len(restart_calls) >= 1, ( + "Drain path failed; expected fallback `systemctl restart`." + ) + @patch("shutil.which", return_value=None) @patch("subprocess.run") def test_update_no_gateway_running_skips_restart(