hermes-agent/tests/gateway/test_status.py
Teknium 07db20c72d
fix(gateway): detect legacy hermes.service + mark --replace SIGTERM as planned (#11909)
* fix(gateway): detect legacy hermes.service units from pre-rename installs

Older Hermes installs used a different service name (hermes.service) before
the rename to hermes-gateway.service. When both units remain installed, they
fight over the same bot token — after PR #5646's signal-recovery change,
this manifests as a 30-second SIGTERM flap loop between the two services.

Detection is an explicit allowlist (no globbing) plus an ExecStart content
check, so profile units (hermes-gateway-<profile>.service) and unrelated
third-party services named 'hermes' are never matched.

Wired into systemd_install, systemd_status, gateway_setup wizard, and the
main hermes setup flow — anywhere we already warn about scope conflicts now
also warns about legacy units.

* feat(gateway): add migrate-legacy command + install-time removal prompt

- New hermes_cli.gateway.remove_legacy_hermes_units() removes legacy
  unit files with stop → disable → unlink → daemon-reload. Handles user
  and system scopes separately; system scope returns path list when not
  running as root so the caller can tell the user to re-run with sudo.
- New 'hermes gateway migrate-legacy' subcommand (with --dry-run and -y)
  routes to remove_legacy_hermes_units via gateway_command dispatch.
- systemd_install now offers to remove legacy units BEFORE installing
  the new hermes-gateway.service, preventing the SIGTERM flap loop that
  hits users who still have pre-rename hermes.service around.

Profile units (hermes-gateway-<profile>.service) remain untouched in
all paths — the legacy allowlist is explicit (_LEGACY_SERVICE_NAMES)
and the ExecStart content check further narrows matches.

* fix(gateway): mark --replace SIGTERM as planned so target exits 0

PR #5646 made SIGTERM exit the gateway with code 1 so systemd's
Restart=on-failure revives it after unexpected kills. But when a user has
two gateway units fighting for the same bot token (e.g. legacy
hermes.service + hermes-gateway.service from a pre-rename install), the
--replace takeover itself becomes the 'unexpected' SIGTERM — the loser
exits 1, systemd revives it 30s later, and the cycle flaps indefinitely.

Before calling terminate_pid(), --replace now writes a short-lived marker
file naming the target PID + start_time. The target's shutdown_signal_handler
consumes the marker and, when it names this process, leaves
_signal_initiated_shutdown=False so the final exit code stays 0.

Staleness defences:
- PID + start_time combo prevents PID reuse matching an old marker
- Marker older than 60s is treated as stale and discarded
- Marker is unlinked on first read even if it doesn't match this process
- Replacer clears the marker post-loop + on permission-denied give-up
2026-04-17 19:27:58 -07:00

444 lines
18 KiB
Python

"""Tests for gateway runtime status tracking."""
import json
import os
from types import SimpleNamespace
from gateway import status
class TestGatewayPidState:
    """Checks on the gateway PID file: what gets written and which records are accepted."""

    @staticmethod
    def _gateway_record(argv):
        """Return a PID-file payload naming this process as a gateway with start_time 123."""
        return {
            "pid": os.getpid(),
            "kind": "hermes-gateway",
            "argv": argv,
            "start_time": 123,
        }

    @staticmethod
    def _patch_process_probes(monkeypatch, cmdline):
        """Patch the process probes used by ``status``.

        ``status.os.kill`` becomes a no-op, ``_get_process_start_time``
        reports 123 and ``_read_process_cmdline`` reports ``cmdline``.
        """
        monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
        monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: cmdline)

    def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch):
        """The written PID file holds this PID, the gateway kind and a non-empty argv list."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))

        status.write_pid_file()

        record = json.loads((tmp_path / "gateway.pid").read_text())
        assert record["pid"] == os.getpid()
        assert record["kind"] == "hermes-gateway"
        argv = record["argv"]
        assert isinstance(argv, list)
        assert argv

    def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
        """A PID file holding only this (live) PID is rejected and the file is removed."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        pid_file = tmp_path / "gateway.pid"
        # Bare PID text: no JSON metadata identifying the process as a gateway.
        pid_file.write_text(f"{os.getpid()}")

        assert status.get_running_pid() is None
        assert not pid_file.exists()

    def test_get_running_pid_accepts_gateway_metadata_when_cmdline_unavailable(self, tmp_path, monkeypatch):
        """With the cmdline probe returning None, the recorded gateway metadata is accepted."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        record = self._gateway_record(["python", "-m", "hermes_cli.main", "gateway"])
        (tmp_path / "gateway.pid").write_text(json.dumps(record))
        self._patch_process_probes(monkeypatch, None)

        assert status.get_running_pid() == os.getpid()

    def test_get_running_pid_accepts_script_style_gateway_cmdline(self, tmp_path, monkeypatch):
        """A ``main.py gateway run --replace`` style command line is accepted."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        record = self._gateway_record(
            ["/venv/bin/python", "/repo/hermes_cli/main.py", "gateway", "run", "--replace"]
        )
        (tmp_path / "gateway.pid").write_text(json.dumps(record))
        self._patch_process_probes(
            monkeypatch,
            "/venv/bin/python /repo/hermes_cli/main.py gateway run --replace",
        )

        assert status.get_running_pid() == os.getpid()

    def test_get_running_pid_accepts_explicit_pid_path_without_cleanup(self, tmp_path, monkeypatch):
        """An explicit PID path with ``cleanup_stale=False`` is read and left in place."""
        profile_home = tmp_path / "profile-home"
        profile_home.mkdir()
        pid_file = profile_home / "gateway.pid"
        record = self._gateway_record(["python", "-m", "hermes_cli.main", "gateway"])
        pid_file.write_text(json.dumps(record))
        self._patch_process_probes(monkeypatch, None)

        found = status.get_running_pid(pid_file, cleanup_stale=False)

        assert found == os.getpid()
        assert pid_file.exists()
class TestGatewayRuntimeStatus:
    """Round-trip checks for ``write_runtime_status`` and ``read_runtime_status``."""

    def test_write_runtime_status_overwrites_stale_pid_on_restart(self, tmp_path, monkeypatch):
        """Regression: setdefault() preserved stale PID from previous process (#1631)."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))

        # State file left behind by an earlier gateway run, still carrying its PID.
        leftover = {
            "pid": 99999,
            "start_time": 1000.0,
            "kind": "hermes-gateway",
            "platforms": {},
            "updated_at": "2025-01-01T00:00:00Z",
        }
        (tmp_path / "gateway_state.json").write_text(json.dumps(leftover))

        status.write_runtime_status(gateway_state="running")

        current = status.read_runtime_status()
        assert current["pid"] == os.getpid(), "PID should be overwritten, not preserved via setdefault"
        assert current["start_time"] != 1000.0, "start_time should be overwritten on restart"

    def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
        """Gateway-level and per-platform failure details are readable after one write."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        failure = {
            "gateway_state": "startup_failed",
            "exit_reason": "telegram conflict",
            "platform": "telegram",
            "platform_state": "fatal",
            "error_code": "telegram_polling_conflict",
            "error_message": "another poller is active",
        }

        status.write_runtime_status(**failure)

        current = status.read_runtime_status()
        assert current["gateway_state"] == "startup_failed"
        assert current["exit_reason"] == "telegram conflict"
        telegram = current["platforms"]["telegram"]
        assert telegram["state"] == "fatal"
        assert telegram["error_code"] == "telegram_polling_conflict"
        assert telegram["error_message"] == "another poller is active"

    def test_write_runtime_status_explicit_none_clears_stale_fields(self, tmp_path, monkeypatch):
        """Passing None explicitly on a later write replaces earlier error fields."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        first_write = {
            "gateway_state": "startup_failed",
            "exit_reason": "stale error",
            "platform": "discord",
            "platform_state": "fatal",
            "error_code": "discord_timeout",
            "error_message": "stale platform error",
        }
        second_write = {
            "gateway_state": "running",
            "exit_reason": None,
            "platform": "discord",
            "platform_state": "connected",
            "error_code": None,
            "error_message": None,
        }

        status.write_runtime_status(**first_write)
        status.write_runtime_status(**second_write)

        current = status.read_runtime_status()
        assert current["gateway_state"] == "running"
        assert current["exit_reason"] is None
        discord = current["platforms"]["discord"]
        assert discord["state"] == "connected"
        assert discord["error_code"] is None
        assert discord["error_message"] is None
class TestTerminatePid:
    """``terminate_pid(force=True)`` behaviour when ``status._IS_WINDOWS`` is patched to True."""

    def test_force_uses_taskkill_on_windows(self, monkeypatch):
        """A forced terminate runs ``taskkill /PID <pid> /T /F`` with captured text output."""
        recorded = []

        def fake_run(cmd, capture_output=False, text=False, timeout=None):
            recorded.append((cmd, capture_output, text, timeout))
            return SimpleNamespace(returncode=0, stdout="", stderr="")

        monkeypatch.setattr(status, "_IS_WINDOWS", True)
        monkeypatch.setattr(status.subprocess, "run", fake_run)

        status.terminate_pid(123, force=True)

        expected_call = (["taskkill", "/PID", "123", "/T", "/F"], True, True, 10)
        assert recorded == [expected_call]

    def test_force_falls_back_to_sigterm_when_taskkill_missing(self, monkeypatch):
        """When running taskkill raises FileNotFoundError, SIGTERM is sent via ``os.kill``."""
        signals_sent = []

        def fake_run(*args, **kwargs):
            raise FileNotFoundError()

        monkeypatch.setattr(status, "_IS_WINDOWS", True)
        monkeypatch.setattr(status.subprocess, "run", fake_run)
        monkeypatch.setattr(
            status.os, "kill", lambda pid, sig: signals_sent.append((pid, sig))
        )

        status.terminate_pid(456, force=True)

        assert signals_sent == [(456, status.signal.SIGTERM)]
class TestScopedLocks:
    """Acquire/release behaviour of scoped lock files under ``HERMES_GATEWAY_LOCK_DIR``."""

    @staticmethod
    def _seed_lock(tmp_path, monkeypatch, filename, content):
        """Point the lock dir at ``tmp_path/locks`` and pre-create ``filename`` holding ``content``."""
        lock_dir = tmp_path / "locks"
        monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(lock_dir))
        lock_file = lock_dir / filename
        lock_file.parent.mkdir(parents=True, exist_ok=True)
        lock_file.write_text(content)
        return lock_file

    @staticmethod
    def _foreign_owner_json():
        """Return the JSON text of a lock record owned by PID 99999."""
        return json.dumps({
            "pid": 99999,
            "start_time": 123,
            "kind": "hermes-gateway",
        })

    def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch):
        """A lock whose recorded owner passes the patched probes is not taken over."""
        self._seed_lock(
            tmp_path,
            monkeypatch,
            "telegram-bot-token-2bb80d537b1da3e3.lock",
            self._foreign_owner_json(),
        )
        monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)

        got_lock, holder = status.acquire_scoped_lock(
            "telegram-bot-token", "secret", metadata={"platform": "telegram"}
        )

        assert got_lock is False
        assert holder["pid"] == 99999

    def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch):
        """When ``os.kill`` raises ProcessLookupError for the owner, the record is replaced."""
        lock_file = self._seed_lock(
            tmp_path,
            monkeypatch,
            "telegram-bot-token-2bb80d537b1da3e3.lock",
            self._foreign_owner_json(),
        )

        def fake_kill(pid, sig):
            raise ProcessLookupError()

        monkeypatch.setattr(status.os, "kill", fake_kill)

        got_lock, _ = status.acquire_scoped_lock(
            "telegram-bot-token", "secret", metadata={"platform": "telegram"}
        )

        assert got_lock is True
        record = json.loads(lock_file.read_text())
        assert record["pid"] == os.getpid()
        assert record["metadata"]["platform"] == "telegram"

    def test_acquire_scoped_lock_recovers_empty_lock_file(self, tmp_path, monkeypatch):
        """Empty lock file (0 bytes) left by a crashed process should be treated as stale."""
        # simulate crash between O_CREAT and json.dump
        lock_file = self._seed_lock(
            tmp_path, monkeypatch, "slack-app-token-2bb80d537b1da3e3.lock", ""
        )

        got_lock, _ = status.acquire_scoped_lock(
            "slack-app-token", "secret", metadata={"platform": "slack"}
        )

        assert got_lock is True
        record = json.loads(lock_file.read_text())
        assert record["pid"] == os.getpid()
        assert record["metadata"]["platform"] == "slack"

    def test_acquire_scoped_lock_recovers_corrupt_lock_file(self, tmp_path, monkeypatch):
        """Lock file with invalid JSON should be treated as stale."""
        # simulate partial write
        lock_file = self._seed_lock(
            tmp_path, monkeypatch, "slack-app-token-2bb80d537b1da3e3.lock", "{truncated"
        )

        got_lock, _ = status.acquire_scoped_lock(
            "slack-app-token", "secret", metadata={"platform": "slack"}
        )

        assert got_lock is True
        record = json.loads(lock_file.read_text())
        assert record["pid"] == os.getpid()

    def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch):
        """A lock acquired by this process exists on disk and is gone after release."""
        monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
        lock_file = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"

        got_lock, _ = status.acquire_scoped_lock(
            "telegram-bot-token", "secret", metadata={"platform": "telegram"}
        )
        assert got_lock is True
        assert lock_file.exists()

        status.release_scoped_lock("telegram-bot-token", "secret")

        assert not lock_file.exists()
class TestTakeoverMarker:
    """Tests for the --replace takeover marker.

    The marker breaks the post-#5646 flap loop between two gateway services
    fighting for the same bot token. The replacer writes a file naming the
    target PID + start_time; the target's shutdown handler sees it and exits
    0 instead of 1, so systemd's Restart=on-failure doesn't revive it.
    """

    def test_write_marker_records_target_identity(self, tmp_path, monkeypatch):
        """The marker file records target PID, target start time, replacer PID and a timestamp."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)

        ok = status.write_takeover_marker(target_pid=12345)

        assert ok is True
        marker = tmp_path / ".gateway-takeover.json"
        assert marker.exists()
        payload = json.loads(marker.read_text())
        assert payload["target_pid"] == 12345
        assert payload["target_start_time"] == 42
        assert payload["replacer_pid"] == os.getpid()
        assert "written_at" in payload

    def test_consume_returns_true_when_marker_names_self(self, tmp_path, monkeypatch):
        """Primary happy path: planned takeover is recognised."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        # Mark THIS process as the target
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
        ok = status.write_takeover_marker(target_pid=os.getpid())
        assert ok is True

        # Call consume as if this process just got SIGTERMed
        result = status.consume_takeover_marker_for_self()

        assert result is True
        # Marker must be unlinked after consumption
        assert not (tmp_path / ".gateway-takeover.json").exists()

    def test_consume_returns_false_for_different_pid(self, tmp_path, monkeypatch):
        """A marker naming a DIFFERENT process must not be consumed as ours."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
        # Marker names a different PID
        other_pid = os.getpid() + 9999
        ok = status.write_takeover_marker(target_pid=other_pid)
        assert ok is True

        result = status.consume_takeover_marker_for_self()

        assert result is False
        # Marker IS unlinked even on non-match (the record has been consumed
        # and isn't relevant to us — leaving it around would grief a later
        # legitimate check).
        assert not (tmp_path / ".gateway-takeover.json").exists()

    def test_consume_returns_false_on_start_time_mismatch(self, tmp_path, monkeypatch):
        """PID reuse defence: old marker's start_time mismatches current process."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        # Marker says target started at time 100 with our PID
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
        ok = status.write_takeover_marker(target_pid=os.getpid())
        # Guard against a vacuous pass: if the write silently failed, consume
        # would return False because the marker is missing, not because of
        # the start_time mismatch this test is about.
        assert ok is True

        # Now change the reported start_time to simulate PID reuse
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
        result = status.consume_takeover_marker_for_self()

        assert result is False

    def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch):
        """With no marker file present, consume reports False."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))

        result = status.consume_takeover_marker_for_self()

        assert result is False

    def test_consume_returns_false_for_stale_marker(self, tmp_path, monkeypatch):
        """A marker older than 60s must be ignored."""
        from datetime import datetime, timezone, timedelta

        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        marker_path = tmp_path / ".gateway-takeover.json"
        # Hand-craft a marker written 2 minutes ago
        stale_time = (datetime.now(timezone.utc) - timedelta(minutes=2)).isoformat()
        marker_path.write_text(json.dumps({
            "target_pid": os.getpid(),
            "target_start_time": 123,
            "replacer_pid": 99999,
            "written_at": stale_time,
        }))
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)

        result = status.consume_takeover_marker_for_self()

        assert result is False
        # Stale markers are unlinked so a later legit shutdown isn't griefed
        assert not marker_path.exists()

    def test_consume_handles_malformed_marker_gracefully(self, tmp_path, monkeypatch):
        """A marker that is not valid JSON yields False instead of raising."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        marker_path = tmp_path / ".gateway-takeover.json"
        marker_path.write_text("not valid json{")

        # Must not raise
        result = status.consume_takeover_marker_for_self()

        assert result is False

    def test_consume_handles_marker_with_missing_fields(self, tmp_path, monkeypatch):
        """A JSON marker lacking the expected fields yields False and is removed."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        marker_path = tmp_path / ".gateway-takeover.json"
        marker_path.write_text(json.dumps({"only_replacer_pid": 99999}))

        result = status.consume_takeover_marker_for_self()

        assert result is False
        # Malformed marker should be cleaned up
        assert not marker_path.exists()

    def test_clear_takeover_marker_is_idempotent(self, tmp_path, monkeypatch):
        """Clearing works with no marker, with a marker, and again after removal."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        # Nothing to clear — must not raise
        status.clear_takeover_marker()

        # Write then clear
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
        status.write_takeover_marker(target_pid=12345)
        assert (tmp_path / ".gateway-takeover.json").exists()
        status.clear_takeover_marker()
        assert not (tmp_path / ".gateway-takeover.json").exists()

        # Clear again — still no error
        status.clear_takeover_marker()

    def test_write_marker_returns_false_on_write_failure(self, tmp_path, monkeypatch):
        """write_takeover_marker is best-effort; returns False but doesn't raise."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))

        def raise_oserror(*args, **kwargs):
            raise OSError("simulated write failure")

        monkeypatch.setattr(status, "_write_json_file", raise_oserror)

        ok = status.write_takeover_marker(target_pid=12345)

        assert ok is False

    def test_consume_ignores_marker_for_different_process_and_prevents_stale_grief(
        self, tmp_path, monkeypatch
    ):
        """Regression: a stale marker from a dead replacer naming a dead
        target must not accidentally cause an unrelated future gateway to
        exit 0 on legitimate SIGTERM.

        The distinguishing check is ``target_pid == our_pid AND
        target_start_time == our_start_time``. Different PID always wins.
        """
        from datetime import datetime, timezone

        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        marker_path = tmp_path / ".gateway-takeover.json"
        # Fresh marker (timestamp is recent) but names a totally different PID
        marker_path.write_text(json.dumps({
            "target_pid": os.getpid() + 10000,
            "target_start_time": 42,
            "replacer_pid": 99999,
            "written_at": datetime.now(timezone.utc).isoformat(),
        }))
        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)

        result = status.consume_takeover_marker_for_self()

        # We are not the target — must NOT consume as planned
        assert result is False