fix(gateway): recover stale pid and planned restart state

This commit is contained in:
helix4u 2026-04-22 17:17:16 -06:00 committed by Teknium
parent 284e084bcc
commit b52123eb15
7 changed files with 646 additions and 79 deletions

View file

@ -77,8 +77,10 @@ class TestSystemdServiceRefresh:
gateway_cli.systemd_restart()
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
assert calls[:2] == [
assert calls[:4] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "show", gateway_cli.get_service_name(), "--no-pager", "--property", "ActiveState,SubState,Result,ExecMainStatus"],
["systemctl", "--user", "reset-failed", gateway_cli.get_service_name()],
["systemctl", "--user", "reload-or-restart", gateway_cli.get_service_name()],
]
@ -474,13 +476,21 @@ class TestGatewaySystemServiceRouting:
raise ProcessLookupError()
monkeypatch.setattr(os, "kill", fake_kill)
# Simulate systemctl is-active returning "active" with a new PID
# Simulate systemctl reset-failed/start followed by an active unit
new_pid = [None]
def fake_subprocess_run(cmd, **kwargs):
if "is-active" in cmd:
result = SimpleNamespace(stdout="active\n", returncode=0)
new_pid[0] = 999 # new PID
return result
if "reset-failed" in cmd:
calls.append(("reset-failed", cmd))
return SimpleNamespace(stdout="", returncode=0)
if "start" in cmd:
calls.append(("start", cmd))
return SimpleNamespace(stdout="", returncode=0)
if "show" in cmd:
new_pid[0] = 999
return SimpleNamespace(
stdout="ActiveState=active\nSubState=running\nResult=success\nExecMainStatus=0\n",
returncode=0,
)
raise AssertionError(f"Unexpected systemctl call: {cmd}")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run)
@ -494,9 +504,131 @@ class TestGatewaySystemServiceRouting:
gateway_cli.systemd_restart()
assert ("self", 654) in calls
assert any(call[0] == "reset-failed" for call in calls)
assert any(call[0] == "start" for call in calls)
out = capsys.readouterr().out.lower()
assert "restarted" in out
def test_systemd_restart_recovers_failed_planned_restart(self, monkeypatch, capsys):
monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None)
monkeypatch.setattr(
"gateway.status.read_runtime_status",
lambda: {"restart_requested": True, "gateway_state": "stopped"},
)
monkeypatch.setattr(gateway_cli, "_request_gateway_self_restart", lambda pid: False)
calls = []
started = {"value": False}
def fake_subprocess_run(cmd, **kwargs):
if "show" in cmd:
if not started["value"]:
return SimpleNamespace(
stdout=(
"ActiveState=failed\n"
"SubState=failed\n"
"Result=exit-code\n"
f"ExecMainStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}\n"
),
returncode=0,
)
return SimpleNamespace(
stdout="ActiveState=active\nSubState=running\nResult=success\nExecMainStatus=0\n",
returncode=0,
)
if "reset-failed" in cmd:
calls.append(("reset-failed", cmd))
return SimpleNamespace(stdout="", returncode=0)
if "start" in cmd:
started["value"] = True
calls.append(("start", cmd))
return SimpleNamespace(stdout="", returncode=0)
raise AssertionError(f"Unexpected command: {cmd}")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run)
monkeypatch.setattr(
"gateway.status.get_running_pid",
lambda: 999 if started["value"] else None,
)
gateway_cli.systemd_restart()
assert any(call[0] == "reset-failed" for call in calls)
assert any(call[0] == "start" for call in calls)
out = capsys.readouterr().out.lower()
assert "restarted" in out
def test_systemd_status_surfaces_planned_restart_failure(self, monkeypatch, capsys):
unit = SimpleNamespace(exists=lambda: True)
monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit)
monkeypatch.setattr(gateway_cli, "has_conflicting_systemd_units", lambda: False)
monkeypatch.setattr(gateway_cli, "has_legacy_hermes_units", lambda: False)
monkeypatch.setattr(gateway_cli, "systemd_unit_is_current", lambda system=False: True)
monkeypatch.setattr(gateway_cli, "_runtime_health_lines", lambda: ["⚠ Last shutdown reason: Gateway restart requested"])
monkeypatch.setattr(gateway_cli, "get_systemd_linger_status", lambda: (True, ""))
monkeypatch.setattr(gateway_cli, "_read_systemd_unit_properties", lambda system=False: {
"ActiveState": "failed",
"SubState": "failed",
"Result": "exit-code",
"ExecMainStatus": str(GATEWAY_SERVICE_RESTART_EXIT_CODE),
})
calls = []
def fake_run_systemctl(args, **kwargs):
calls.append(args)
if args[:2] == ["status", gateway_cli.get_service_name()]:
return SimpleNamespace(returncode=0, stdout="", stderr="")
if args[:2] == ["is-active", gateway_cli.get_service_name()]:
return SimpleNamespace(returncode=3, stdout="failed\n", stderr="")
raise AssertionError(f"Unexpected args: {args}")
monkeypatch.setattr(gateway_cli, "_run_systemctl", fake_run_systemctl)
gateway_cli.systemd_status()
out = capsys.readouterr().out
assert "Planned restart is stuck in systemd failed state" in out
def test_gateway_status_dispatches_full_flag(self, monkeypatch):
user_unit = SimpleNamespace(exists=lambda: True)
system_unit = SimpleNamespace(exists=lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(
gateway_cli,
"get_systemd_unit_path",
lambda system=False: system_unit if system else user_unit,
)
monkeypatch.setattr(
gateway_cli,
"get_gateway_runtime_snapshot",
lambda system=False: gateway_cli.GatewayRuntimeSnapshot(
manager="systemd (user)",
service_installed=True,
service_running=False,
gateway_pids=(),
service_scope="user",
),
)
calls = []
monkeypatch.setattr(
gateway_cli,
"systemd_status",
lambda deep=False, system=False, full=False: calls.append((deep, system, full)),
)
gateway_cli.gateway_command(
SimpleNamespace(gateway_command="status", deep=False, system=False, full=True)
)
assert calls == [(False, False, True)]
def test_gateway_install_passes_system_flags(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
@ -547,11 +679,15 @@ class TestGatewaySystemServiceRouting:
)
calls = []
monkeypatch.setattr(gateway_cli, "systemd_status", lambda deep=False, system=False: calls.append((deep, system)))
monkeypatch.setattr(
gateway_cli,
"systemd_status",
lambda deep=False, system=False, full=False: calls.append((deep, system, full)),
)
gateway_cli.gateway_command(SimpleNamespace(gateway_command="status", deep=False, system=False))
assert calls == [(False, False)]
assert calls == [(False, False, False)]
def test_gateway_status_reports_manual_process_when_service_is_stopped(self, monkeypatch, capsys):
user_unit = SimpleNamespace(exists=lambda: True)
@ -565,7 +701,11 @@ class TestGatewaySystemServiceRouting:
"get_systemd_unit_path",
lambda system=False: system_unit if system else user_unit,
)
monkeypatch.setattr(gateway_cli, "systemd_status", lambda deep=False, system=False: print("service stopped"))
monkeypatch.setattr(
gateway_cli,
"systemd_status",
lambda deep=False, system=False, full=False: print("service stopped"),
)
monkeypatch.setattr(
gateway_cli,
"get_gateway_runtime_snapshot",
@ -1570,6 +1710,23 @@ class TestMigrateLegacyCommand:
assert called == {"interactive": False, "dry_run": False}
class TestGatewayStatusParser:
def test_gateway_status_subparser_accepts_full_flag(self):
import subprocess
import sys
result = subprocess.run(
[sys.executable, "-m", "hermes_cli.main", "gateway", "status", "-l", "--help"],
cwd=str(gateway_cli.PROJECT_ROOT),
capture_output=True,
text=True,
timeout=15,
)
assert result.returncode == 0
assert "unrecognized arguments" not in result.stderr
def test_gateway_command_migrate_legacy_dry_run_passes_through(
self, monkeypatch
):