mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): recover stale pid and planned restart state
This commit is contained in:
parent
284e084bcc
commit
b52123eb15
7 changed files with 646 additions and 79 deletions
|
|
@ -65,7 +65,11 @@ class TestGatewayPidState:
|
|||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
||||
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
|
||||
|
||||
assert status.get_running_pid() == os.getpid()
|
||||
assert status.acquire_gateway_runtime_lock() is True
|
||||
try:
|
||||
assert status.get_running_pid() == os.getpid()
|
||||
finally:
|
||||
status.release_gateway_runtime_lock()
|
||||
|
||||
def test_get_running_pid_accepts_script_style_gateway_cmdline(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
|
@ -85,7 +89,11 @@ class TestGatewayPidState:
|
|||
lambda pid: "/venv/bin/python /repo/hermes_cli/main.py gateway run --replace",
|
||||
)
|
||||
|
||||
assert status.get_running_pid() == os.getpid()
|
||||
assert status.acquire_gateway_runtime_lock() is True
|
||||
try:
|
||||
assert status.get_running_pid() == os.getpid()
|
||||
finally:
|
||||
status.release_gateway_runtime_lock()
|
||||
|
||||
def test_get_running_pid_accepts_explicit_pid_path_without_cleanup(self, tmp_path, monkeypatch):
|
||||
other_home = tmp_path / "profile-home"
|
||||
|
|
@ -102,9 +110,82 @@ class TestGatewayPidState:
|
|||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
||||
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
|
||||
|
||||
lock_path = other_home / "gateway.lock"
|
||||
lock_path.write_text(json.dumps({
|
||||
"pid": os.getpid(),
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
|
||||
"start_time": 123,
|
||||
}))
|
||||
monkeypatch.setattr(status, "is_gateway_runtime_lock_active", lambda lock_path=None: True)
|
||||
|
||||
assert status.get_running_pid(pid_path, cleanup_stale=False) == os.getpid()
|
||||
assert pid_path.exists()
|
||||
|
||||
def test_runtime_lock_claims_and_releases_liveness(self, tmp_path, monkeypatch):
    """The runtime lock reads as active only between acquire and release."""
    # Use an empty temp directory as HERMES_HOME for this test.
    # NOTE(review): presumably the lock file is resolved under HERMES_HOME — confirm in status.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    assert status.is_gateway_runtime_lock_active() is False
    assert status.acquire_gateway_runtime_lock() is True
    try:
        assert status.is_gateway_runtime_lock_active() is True
    finally:
        # Always release, matching the sibling tests in this class, so a failed
        # assertion cannot leave this process holding the lock.
        status.release_gateway_runtime_lock()

    assert status.is_gateway_runtime_lock_active() is False
|
||||
|
||||
def test_get_running_pid_treats_pid_file_as_stale_without_runtime_lock(self, tmp_path, monkeypatch):
    """A PID file without a held runtime lock yields no PID and gets removed."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # PID record naming this very process; no runtime lock is acquired in this test.
    unlocked_record = {
        "pid": os.getpid(),
        "kind": "hermes-gateway",
        "argv": ["python", "-m", "hermes_cli.main", "gateway"],
        "start_time": 123,
    }
    pid_path = tmp_path / "gateway.pid"
    pid_path.write_text(json.dumps(unlocked_record))

    # Process probes: kill is a no-op, the start time matches the record,
    # and the command line is reported as unavailable.
    probe_stubs = (
        (status.os, "kill", lambda pid, sig: None),
        (status, "_get_process_start_time", lambda pid: 123),
        (status, "_read_process_cmdline", lambda pid: None),
    )
    for target, attribute, stub in probe_stubs:
        monkeypatch.setattr(target, attribute, stub)

    reported_pid = status.get_running_pid()

    assert reported_pid is None
    assert not pid_path.exists()
|
||||
|
||||
def test_get_running_pid_falls_back_to_live_lock_record(self, tmp_path, monkeypatch):
    """With a dead PID in the PID file and the lock held, this process's PID is reported."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    dead_pid = 99999

    def gateway_record(pid):
        # Gateway PID record; only the pid differs between the two records used here.
        return {
            "pid": pid,
            "kind": "hermes-gateway",
            "argv": ["python", "-m", "hermes_cli.main", "gateway"],
            "start_time": 123,
        }

    def fake_kill(pid, sig):
        # Only the PID written to the PID file is reported as missing.
        if pid != dead_pid:
            return None
        raise ProcessLookupError

    pid_path = tmp_path / "gateway.pid"
    pid_path.write_text(json.dumps(gateway_record(dead_pid)))

    monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
    monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
    # NOTE(review): presumably acquire_gateway_runtime_lock() stores this record
    # in the lock file — confirm in status.
    monkeypatch.setattr(status, "_build_pid_record", lambda: gateway_record(os.getpid()))
    assert status.acquire_gateway_runtime_lock() is True

    monkeypatch.setattr(status.os, "kill", fake_kill)

    try:
        assert status.get_running_pid() == os.getpid()
    finally:
        status.release_gateway_runtime_lock()
|
||||
|
||||
|
||||
class TestGatewayRuntimeStatus:
|
||||
def test_write_runtime_status_overwrites_stale_pid_on_restart(self, tmp_path, monkeypatch):
|
||||
|
|
|
|||
|
|
@ -121,6 +121,12 @@ def test_systemd_status_warns_when_linger_disabled(monkeypatch, tmp_path, capsys
|
|||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
if cmd[:3] == ["systemctl", "--user", "is-active"]:
|
||||
return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
|
||||
if cmd[:3] == ["systemctl", "--user", "show"]:
|
||||
return SimpleNamespace(
|
||||
returncode=0,
|
||||
stdout="ActiveState=active\nSubState=running\nResult=success\nExecMainStatus=0\n",
|
||||
stderr="",
|
||||
)
|
||||
raise AssertionError(f"Unexpected command: {cmd}")
|
||||
|
||||
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
|
||||
|
|
@ -352,3 +358,24 @@ class TestWaitForGatewayExit:
|
|||
|
||||
assert killed == 2
|
||||
assert calls == [(11, True), (22, True)]
|
||||
|
||||
|
||||
class TestStopProfileGateway:
    """Tests for ``gateway.stop_profile_gateway``."""

    def test_stop_profile_gateway_keeps_pid_file_when_process_still_running(self, monkeypatch):
        """The PID file is not removed while the reported PID never goes away."""
        counts = {"kill": 0, "remove": 0}

        def count_kill(pid, sig):
            counts["kill"] += 1

        def count_remove():
            counts["remove"] += 1

        # The gateway PID stays reported as running for the whole call.
        monkeypatch.setattr("gateway.status.get_running_pid", lambda: 12345)
        monkeypatch.setattr(gateway.os, "kill", count_kill)
        # Skip real waiting between checks.
        monkeypatch.setattr("time.sleep", lambda _: None)
        monkeypatch.setattr("gateway.status.remove_pid_file", count_remove)

        stopped = gateway.stop_profile_gateway()

        assert stopped is True
        # NOTE(review): presumably one stop signal plus 20 liveness probes —
        # confirm against stop_profile_gateway's retry loop.
        assert counts["kill"] == 21
        assert counts["remove"] == 0
|
||||
|
|
|
|||
|
|
@ -77,8 +77,10 @@ class TestSystemdServiceRefresh:
|
|||
gateway_cli.systemd_restart()
|
||||
|
||||
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
|
||||
assert calls[:2] == [
|
||||
assert calls[:4] == [
|
||||
["systemctl", "--user", "daemon-reload"],
|
||||
["systemctl", "--user", "show", gateway_cli.get_service_name(), "--no-pager", "--property", "ActiveState,SubState,Result,ExecMainStatus"],
|
||||
["systemctl", "--user", "reset-failed", gateway_cli.get_service_name()],
|
||||
["systemctl", "--user", "reload-or-restart", gateway_cli.get_service_name()],
|
||||
]
|
||||
|
||||
|
|
@ -474,13 +476,21 @@ class TestGatewaySystemServiceRouting:
|
|||
raise ProcessLookupError()
|
||||
monkeypatch.setattr(os, "kill", fake_kill)
|
||||
|
||||
# Simulate systemctl is-active returning "active" with a new PID
|
||||
# Simulate systemctl reset-failed/start followed by an active unit
|
||||
new_pid = [None]
|
||||
def fake_subprocess_run(cmd, **kwargs):
|
||||
if "is-active" in cmd:
|
||||
result = SimpleNamespace(stdout="active\n", returncode=0)
|
||||
new_pid[0] = 999 # new PID
|
||||
return result
|
||||
if "reset-failed" in cmd:
|
||||
calls.append(("reset-failed", cmd))
|
||||
return SimpleNamespace(stdout="", returncode=0)
|
||||
if "start" in cmd:
|
||||
calls.append(("start", cmd))
|
||||
return SimpleNamespace(stdout="", returncode=0)
|
||||
if "show" in cmd:
|
||||
new_pid[0] = 999
|
||||
return SimpleNamespace(
|
||||
stdout="ActiveState=active\nSubState=running\nResult=success\nExecMainStatus=0\n",
|
||||
returncode=0,
|
||||
)
|
||||
raise AssertionError(f"Unexpected systemctl call: {cmd}")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run)
|
||||
|
|
@ -494,9 +504,131 @@ class TestGatewaySystemServiceRouting:
|
|||
gateway_cli.systemd_restart()
|
||||
|
||||
assert ("self", 654) in calls
|
||||
assert any(call[0] == "reset-failed" for call in calls)
|
||||
assert any(call[0] == "start" for call in calls)
|
||||
out = capsys.readouterr().out.lower()
|
||||
assert "restarted" in out
|
||||
|
||||
def test_systemd_restart_recovers_failed_planned_restart(self, monkeypatch, capsys):
    """A unit left failed with the planned-restart exit code is reset and started again."""
    recorded = []
    unit_started = False

    def fake_subprocess_run(cmd, **kwargs):
        nonlocal unit_started
        if "show" in cmd:
            if unit_started:
                properties = "ActiveState=active\nSubState=running\nResult=success\nExecMainStatus=0\n"
            else:
                # Until "start" is issued the unit reports the failed planned restart.
                properties = (
                    "ActiveState=failed\n"
                    "SubState=failed\n"
                    "Result=exit-code\n"
                    f"ExecMainStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}\n"
                )
            return SimpleNamespace(stdout=properties, returncode=0)
        if "reset-failed" in cmd:
            recorded.append(("reset-failed", cmd))
        elif "start" in cmd:
            unit_started = True
            recorded.append(("start", cmd))
        else:
            raise AssertionError(f"Unexpected command: {cmd}")
        return SimpleNamespace(stdout="", returncode=0)

    def fake_running_pid():
        # A gateway PID only exists once the unit has been started.
        return 999 if unit_started else None

    monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
    monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None)
    monkeypatch.setattr(
        "gateway.status.read_runtime_status",
        lambda: {"restart_requested": True, "gateway_state": "stopped"},
    )
    monkeypatch.setattr(gateway_cli, "_request_gateway_self_restart", lambda pid: False)
    monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run)
    monkeypatch.setattr("gateway.status.get_running_pid", fake_running_pid)

    gateway_cli.systemd_restart()

    issued = [label for label, _ in recorded]
    assert "reset-failed" in issued
    assert "start" in issued
    assert "restarted" in capsys.readouterr().out.lower()
|
||||
|
||||
def test_systemd_status_surfaces_planned_restart_failure(self, monkeypatch, capsys):
    """systemd_status() prints the stuck-planned-restart message for a failed unit."""
    unit = SimpleNamespace(exists=lambda: True)
    monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
    monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit)
    monkeypatch.setattr(gateway_cli, "has_conflicting_systemd_units", lambda: False)
    monkeypatch.setattr(gateway_cli, "has_legacy_hermes_units", lambda: False)
    monkeypatch.setattr(gateway_cli, "systemd_unit_is_current", lambda system=False: True)
    monkeypatch.setattr(gateway_cli, "_runtime_health_lines", lambda: ["⚠ Last shutdown reason: Gateway restart requested"])
    monkeypatch.setattr(gateway_cli, "get_systemd_linger_status", lambda: (True, ""))
    # Unit properties of a service that exited with the planned-restart exit
    # code and was left in the failed state.
    monkeypatch.setattr(gateway_cli, "_read_systemd_unit_properties", lambda system=False: {
        "ActiveState": "failed",
        "SubState": "failed",
        "Result": "exit-code",
        "ExecMainStatus": str(GATEWAY_SERVICE_RESTART_EXIT_CODE),
    })

    def fake_run_systemctl(args, **kwargs):
        # Only "status" and "is-active" for the gateway service are allowed;
        # any other systemctl call fails the test.
        service = gateway_cli.get_service_name()
        if args[:2] == ["status", service]:
            return SimpleNamespace(returncode=0, stdout="", stderr="")
        if args[:2] == ["is-active", service]:
            return SimpleNamespace(returncode=3, stdout="failed\n", stderr="")
        raise AssertionError(f"Unexpected args: {args}")

    monkeypatch.setattr(gateway_cli, "_run_systemctl", fake_run_systemctl)

    gateway_cli.systemd_status()

    out = capsys.readouterr().out
    assert "Planned restart is stuck in systemd failed state" in out
|
||||
|
||||
def test_gateway_status_dispatches_full_flag(self, monkeypatch):
    """`gateway status` with full=True forwards full=True to systemd_status."""
    user_unit = SimpleNamespace(exists=lambda: True)
    system_unit = SimpleNamespace(exists=lambda: False)
    received = []

    def fake_unit_path(system=False):
        # Only the user-scope unit exists.
        if system:
            return system_unit
        return user_unit

    def fake_snapshot(system=False):
        # Installed user service that is not running, with no gateway PIDs.
        return gateway_cli.GatewayRuntimeSnapshot(
            manager="systemd (user)",
            service_installed=True,
            service_running=False,
            gateway_pids=(),
            service_scope="user",
        )

    def fake_systemd_status(deep=False, system=False, full=False):
        received.append((deep, system, full))

    monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
    monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
    monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
    monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", fake_unit_path)
    monkeypatch.setattr(gateway_cli, "get_gateway_runtime_snapshot", fake_snapshot)
    monkeypatch.setattr(gateway_cli, "systemd_status", fake_systemd_status)

    cli_args = SimpleNamespace(gateway_command="status", deep=False, system=False, full=True)
    gateway_cli.gateway_command(cli_args)

    assert received == [(False, False, True)]
|
||||
|
||||
def test_gateway_install_passes_system_flags(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||
|
|
@ -547,11 +679,15 @@ class TestGatewaySystemServiceRouting:
|
|||
)
|
||||
|
||||
calls = []
|
||||
monkeypatch.setattr(gateway_cli, "systemd_status", lambda deep=False, system=False: calls.append((deep, system)))
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"systemd_status",
|
||||
lambda deep=False, system=False, full=False: calls.append((deep, system, full)),
|
||||
)
|
||||
|
||||
gateway_cli.gateway_command(SimpleNamespace(gateway_command="status", deep=False, system=False))
|
||||
|
||||
assert calls == [(False, False)]
|
||||
assert calls == [(False, False, False)]
|
||||
|
||||
def test_gateway_status_reports_manual_process_when_service_is_stopped(self, monkeypatch, capsys):
|
||||
user_unit = SimpleNamespace(exists=lambda: True)
|
||||
|
|
@ -565,7 +701,11 @@ class TestGatewaySystemServiceRouting:
|
|||
"get_systemd_unit_path",
|
||||
lambda system=False: system_unit if system else user_unit,
|
||||
)
|
||||
monkeypatch.setattr(gateway_cli, "systemd_status", lambda deep=False, system=False: print("service stopped"))
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"systemd_status",
|
||||
lambda deep=False, system=False, full=False: print("service stopped"),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"get_gateway_runtime_snapshot",
|
||||
|
|
@ -1570,6 +1710,23 @@ class TestMigrateLegacyCommand:
|
|||
|
||||
assert called == {"interactive": False, "dry_run": False}
|
||||
|
||||
|
||||
class TestGatewayStatusParser:
|
||||
def test_gateway_status_subparser_accepts_full_flag(self):
    """The ``gateway status`` subparser must define the ``-l`` flag."""
    import re
    import subprocess
    import sys

    # ``--help`` keeps the subprocess from running the real status command.
    result = subprocess.run(
        [sys.executable, "-m", "hermes_cli.main", "gateway", "status", "-l", "--help"],
        cwd=str(gateway_cli.PROJECT_ROOT),
        capture_output=True,
        text=True,
        timeout=15,
    )

    assert result.returncode == 0
    assert "unrecognized arguments" not in result.stderr
    # argparse's help action prints and exits 0 as soon as ``--help`` is
    # parsed, before unknown options are reported, so the two checks above
    # pass even if ``-l`` is not defined. Require the flag to be listed in
    # the help text as well.
    # NOTE(review): assumes the flag is not hidden with help=argparse.SUPPRESS — confirm.
    assert re.search(r"(?<![-\w])-l\b", result.stdout), result.stdout
|
||||
|
||||
def test_gateway_command_migrate_legacy_dry_run_passes_through(
|
||||
self, monkeypatch
|
||||
):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue