mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: hermes gateway restart waits for service to come back up (#8260)
Previously, systemd_restart() sent SIGUSR1 to the gateway, printed 'restart requested', and returned immediately. The gateway still needed to drain active agents, exit with code 75, wait for systemd's RestartSec=30, and start the new process. The user saw 'success' but the gateway was actually down for 30-60 seconds. Now the SIGUSR1 path blocks with progress feedback: Phase 1 — wait for old process to die: ⏳ User service draining active work... Polls os.kill(pid, 0) until ProcessLookupError (up to 90s) Phase 2 — wait for new process to become active: ⏳ Waiting for hermes-gateway to restart... Polls systemctl is-active + verifies new PID (up to 60s) Success: ✓ User service restarted (PID 12345) Timeout: ⚠ User service did not become active within 60s. Check status: hermes gateway status Check logs: journalctl --user -u hermes-gateway --since '2 min ago' The reload-or-restart fallback path (line 1189) already blocks because systemctl reload-or-restart is synchronous. Test plan: - Updated test to verify wait-for-restart behavior - All 118 gateway CLI tests pass
This commit is contained in:
parent
6c89306437
commit
2a98098035
2 changed files with 85 additions and 9 deletions
|
|
@ -1128,7 +1128,62 @@ def systemd_restart(system: bool = False):
|
|||
|
||||
pid = get_running_pid()
|
||||
if pid is not None and _request_gateway_self_restart(pid):
|
||||
print(f"✓ {_service_scope_label(system).capitalize()} service restart requested")
|
||||
# SIGUSR1 sent — the gateway will drain active agents, exit with
|
||||
# code 75, and systemd will restart it after RestartSec (30s).
|
||||
# Wait for the old process to die and the new one to become active
|
||||
# so the CLI doesn't return while the service is still restarting.
|
||||
import time
|
||||
scope_label = _service_scope_label(system).capitalize()
|
||||
svc = get_service_name()
|
||||
scope_cmd = _systemctl_cmd(system)
|
||||
|
||||
# Phase 1: wait for old process to exit (drain + shutdown)
|
||||
print(f"⏳ {scope_label} service draining active work...")
|
||||
deadline = time.time() + 90
|
||||
while time.time() < deadline:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
time.sleep(1)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
break # old process is gone
|
||||
else:
|
||||
print(f"⚠ Old process (PID {pid}) still alive after 90s")
|
||||
|
||||
# Phase 2: wait for systemd to start the new process
|
||||
print(f"⏳ Waiting for {svc} to restart...")
|
||||
deadline = time.time() + 60
|
||||
while time.time() < deadline:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
scope_cmd + ["is-active", svc],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
if result.stdout.strip() == "active":
|
||||
# Verify it's a NEW process, not the old one somehow
|
||||
new_pid = get_running_pid()
|
||||
if new_pid and new_pid != pid:
|
||||
print(f"✓ {scope_label} service restarted (PID {new_pid})")
|
||||
return
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError):
|
||||
pass
|
||||
time.sleep(2)
|
||||
|
||||
# Timed out — check final state
|
||||
try:
|
||||
result = subprocess.run(
|
||||
scope_cmd + ["is-active", svc],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
if result.stdout.strip() == "active":
|
||||
print(f"✓ {scope_label} service restarted")
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
print(
|
||||
f"⚠ {scope_label} service did not become active within 60s.\n"
|
||||
f" Check status: {'sudo ' if system else ''}hermes gateway status\n"
|
||||
f" Check logs: journalctl {'--user ' if not system else ''}-u {svc} --since '2 min ago'"
|
||||
)
|
||||
return
|
||||
_run_systemctl(["reload-or-restart", get_service_name()], system=system, check=True, timeout=90)
|
||||
print(f"✓ {_service_scope_label(system).capitalize()} service restarted")
|
||||
|
|
|
|||
|
|
@ -452,7 +452,7 @@ class TestGatewayServiceDetection:
|
|||
|
||||
|
||||
class TestGatewaySystemServiceRouting:
|
||||
def test_systemd_restart_self_requests_graceful_restart_without_reload_or_restart(self, monkeypatch, capsys):
|
||||
def test_systemd_restart_self_requests_graceful_restart_and_waits(self, monkeypatch, capsys):
|
||||
calls = []
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
|
||||
|
|
@ -466,16 +466,37 @@ class TestGatewaySystemServiceRouting:
|
|||
"_request_gateway_self_restart",
|
||||
lambda pid: calls.append(("self", pid)) or True,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli.subprocess,
|
||||
"run",
|
||||
lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("systemctl should not run")),
|
||||
)
|
||||
|
||||
# Simulate: old process dies immediately, new process becomes active
|
||||
kill_call_count = [0]
|
||||
def fake_kill(pid, sig):
|
||||
kill_call_count[0] += 1
|
||||
if kill_call_count[0] >= 2: # first call checks, second = dead
|
||||
raise ProcessLookupError()
|
||||
monkeypatch.setattr(os, "kill", fake_kill)
|
||||
|
||||
# Simulate systemctl is-active returning "active" with a new PID
|
||||
new_pid = [None]
|
||||
def fake_subprocess_run(cmd, **kwargs):
|
||||
if "is-active" in cmd:
|
||||
result = SimpleNamespace(stdout="active\n", returncode=0)
|
||||
new_pid[0] = 999 # new PID
|
||||
return result
|
||||
raise AssertionError(f"Unexpected systemctl call: {cmd}")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run)
|
||||
# get_running_pid returns new PID after restart
|
||||
pid_calls = [0]
|
||||
def fake_get_pid():
|
||||
pid_calls[0] += 1
|
||||
return 999 if pid_calls[0] > 1 else 654
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", fake_get_pid)
|
||||
|
||||
gateway_cli.systemd_restart()
|
||||
|
||||
assert calls == [("refresh", False), ("self", 654)]
|
||||
assert "restart requested" in capsys.readouterr().out.lower()
|
||||
assert ("self", 654) in calls
|
||||
out = capsys.readouterr().out.lower()
|
||||
assert "restarted" in out
|
||||
|
||||
def test_gateway_install_passes_system_flags(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue