hermes-agent/tests/gateway/test_status.py
Teknium ce9c91c8f7 fix(gateway): close --replace race completely by claiming PID before adapter startup
Follow-up on top of opriz's atomic PID file fix. The prior change caught
the race AFTER runner.start(), so the loser still opened Telegram polling
and Discord gateway sockets before detecting the conflict and exiting.

Hoist the PID-claim block to BEFORE runner.start(). Now the loser of the
O_CREAT|O_EXCL race returns from start_gateway() without ever bringing up
any platform adapter — no Telegram conflict, no Discord duplicate session.

Also add regression tests:
- test_write_pid_file_is_atomic_against_concurrent_writers: second
  write_pid_file() raises FileExistsError rather than clobbering.
- Two existing replace-path tests updated to stateful mocks since the
  real post-kill state (get_running_pid None after remove_pid_file)
  is now exercised by the hoisted re-check.
2026-04-21 00:43:50 -07:00

468 lines
19 KiB
Python

"""Tests for gateway runtime status tracking."""
import json
import os
from types import SimpleNamespace
from gateway import status
class TestGatewayPidState:
def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
status.write_pid_file()
payload = json.loads((tmp_path / "gateway.pid").read_text())
assert payload["pid"] == os.getpid()
assert payload["kind"] == "hermes-gateway"
assert isinstance(payload["argv"], list)
assert payload["argv"]
def test_write_pid_file_is_atomic_against_concurrent_writers(self, tmp_path, monkeypatch):
"""Regression: two concurrent --replace invocations must not both win.
Without O_CREAT|O_EXCL, two processes racing through start_gateway()'s
termination-wait would both write to gateway.pid, silently overwriting
each other and leaving multiple gateway instances alive (#11718).
"""
import pytest
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# First write wins.
status.write_pid_file()
assert (tmp_path / "gateway.pid").exists()
# Second write (simulating a racing --replace that missed the earlier
# guards) must raise FileExistsError rather than clobber the record.
with pytest.raises(FileExistsError):
status.write_pid_file()
# Original record is preserved.
payload = json.loads((tmp_path / "gateway.pid").read_text())
assert payload["pid"] == os.getpid()
def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
pid_path = tmp_path / "gateway.pid"
pid_path.write_text(str(os.getpid()))
assert status.get_running_pid() is None
assert not pid_path.exists()
def test_get_running_pid_accepts_gateway_metadata_when_cmdline_unavailable(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
pid_path = tmp_path / "gateway.pid"
pid_path.write_text(json.dumps({
"pid": os.getpid(),
"kind": "hermes-gateway",
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
"start_time": 123,
}))
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
assert status.get_running_pid() == os.getpid()
def test_get_running_pid_accepts_script_style_gateway_cmdline(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
pid_path = tmp_path / "gateway.pid"
pid_path.write_text(json.dumps({
"pid": os.getpid(),
"kind": "hermes-gateway",
"argv": ["/venv/bin/python", "/repo/hermes_cli/main.py", "gateway", "run", "--replace"],
"start_time": 123,
}))
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
monkeypatch.setattr(
status,
"_read_process_cmdline",
lambda pid: "/venv/bin/python /repo/hermes_cli/main.py gateway run --replace",
)
assert status.get_running_pid() == os.getpid()
def test_get_running_pid_accepts_explicit_pid_path_without_cleanup(self, tmp_path, monkeypatch):
other_home = tmp_path / "profile-home"
other_home.mkdir()
pid_path = other_home / "gateway.pid"
pid_path.write_text(json.dumps({
"pid": os.getpid(),
"kind": "hermes-gateway",
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
"start_time": 123,
}))
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
assert status.get_running_pid(pid_path, cleanup_stale=False) == os.getpid()
assert pid_path.exists()
class TestGatewayRuntimeStatus:
def test_write_runtime_status_overwrites_stale_pid_on_restart(self, tmp_path, monkeypatch):
"""Regression: setdefault() preserved stale PID from previous process (#1631)."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Simulate a previous gateway run that left a state file with a stale PID
state_path = tmp_path / "gateway_state.json"
state_path.write_text(json.dumps({
"pid": 99999,
"start_time": 1000.0,
"kind": "hermes-gateway",
"platforms": {},
"updated_at": "2025-01-01T00:00:00Z",
}))
status.write_runtime_status(gateway_state="running")
payload = status.read_runtime_status()
assert payload["pid"] == os.getpid(), "PID should be overwritten, not preserved via setdefault"
assert payload["start_time"] != 1000.0, "start_time should be overwritten on restart"
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
status.write_runtime_status(
gateway_state="startup_failed",
exit_reason="telegram conflict",
platform="telegram",
platform_state="fatal",
error_code="telegram_polling_conflict",
error_message="another poller is active",
)
payload = status.read_runtime_status()
assert payload["gateway_state"] == "startup_failed"
assert payload["exit_reason"] == "telegram conflict"
assert payload["platforms"]["telegram"]["state"] == "fatal"
assert payload["platforms"]["telegram"]["error_code"] == "telegram_polling_conflict"
assert payload["platforms"]["telegram"]["error_message"] == "another poller is active"
def test_write_runtime_status_explicit_none_clears_stale_fields(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
status.write_runtime_status(
gateway_state="startup_failed",
exit_reason="stale error",
platform="discord",
platform_state="fatal",
error_code="discord_timeout",
error_message="stale platform error",
)
status.write_runtime_status(
gateway_state="running",
exit_reason=None,
platform="discord",
platform_state="connected",
error_code=None,
error_message=None,
)
payload = status.read_runtime_status()
assert payload["gateway_state"] == "running"
assert payload["exit_reason"] is None
assert payload["platforms"]["discord"]["state"] == "connected"
assert payload["platforms"]["discord"]["error_code"] is None
assert payload["platforms"]["discord"]["error_message"] is None
class TestTerminatePid:
def test_force_uses_taskkill_on_windows(self, monkeypatch):
calls = []
monkeypatch.setattr(status, "_IS_WINDOWS", True)
def fake_run(cmd, capture_output=False, text=False, timeout=None):
calls.append((cmd, capture_output, text, timeout))
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(status.subprocess, "run", fake_run)
status.terminate_pid(123, force=True)
assert calls == [
(["taskkill", "/PID", "123", "/T", "/F"], True, True, 10)
]
def test_force_falls_back_to_sigterm_when_taskkill_missing(self, monkeypatch):
calls = []
monkeypatch.setattr(status, "_IS_WINDOWS", True)
def fake_run(*args, **kwargs):
raise FileNotFoundError
def fake_kill(pid, sig):
calls.append((pid, sig))
monkeypatch.setattr(status.subprocess, "run", fake_run)
monkeypatch.setattr(status.os, "kill", fake_kill)
status.terminate_pid(456, force=True)
assert calls == [(456, status.signal.SIGTERM)]
class TestScopedLocks:
def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 99999,
"start_time": 123,
"kind": "hermes-gateway",
}))
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
assert acquired is False
assert existing["pid"] == 99999
def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 99999,
"start_time": 123,
"kind": "hermes-gateway",
}))
def fake_kill(pid, sig):
raise ProcessLookupError
monkeypatch.setattr(status.os, "kill", fake_kill)
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
assert acquired is True
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
assert payload["metadata"]["platform"] == "telegram"
def test_acquire_scoped_lock_recovers_empty_lock_file(self, tmp_path, monkeypatch):
"""Empty lock file (0 bytes) left by a crashed process should be treated as stale."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text("") # simulate crash between O_CREAT and json.dump
acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
assert acquired is True
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
assert payload["metadata"]["platform"] == "slack"
def test_acquire_scoped_lock_recovers_corrupt_lock_file(self, tmp_path, monkeypatch):
"""Lock file with invalid JSON should be treated as stale."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text("{truncated") # simulate partial write
acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
assert acquired is True
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
acquired, _ = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
assert acquired is True
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
assert lock_path.exists()
status.release_scoped_lock("telegram-bot-token", "secret")
assert not lock_path.exists()
class TestTakeoverMarker:
"""Tests for the --replace takeover marker.
The marker breaks the post-#5646 flap loop between two gateway services
fighting for the same bot token. The replacer writes a file naming the
target PID + start_time; the target's shutdown handler sees it and exits
0 instead of 1, so systemd's Restart=on-failure doesn't revive it.
"""
def test_write_marker_records_target_identity(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
ok = status.write_takeover_marker(target_pid=12345)
assert ok is True
marker = tmp_path / ".gateway-takeover.json"
assert marker.exists()
payload = json.loads(marker.read_text())
assert payload["target_pid"] == 12345
assert payload["target_start_time"] == 42
assert payload["replacer_pid"] == os.getpid()
assert "written_at" in payload
def test_consume_returns_true_when_marker_names_self(self, tmp_path, monkeypatch):
"""Primary happy path: planned takeover is recognised."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Mark THIS process as the target
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
ok = status.write_takeover_marker(target_pid=os.getpid())
assert ok is True
# Call consume as if this process just got SIGTERMed
result = status.consume_takeover_marker_for_self()
assert result is True
# Marker must be unlinked after consumption
assert not (tmp_path / ".gateway-takeover.json").exists()
def test_consume_returns_false_for_different_pid(self, tmp_path, monkeypatch):
"""A marker naming a DIFFERENT process must not be consumed as ours."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
# Marker names a different PID
other_pid = os.getpid() + 9999
ok = status.write_takeover_marker(target_pid=other_pid)
assert ok is True
result = status.consume_takeover_marker_for_self()
assert result is False
# Marker IS unlinked even on non-match (the record has been consumed
# and isn't relevant to us — leaving it around would grief a later
# legitimate check).
assert not (tmp_path / ".gateway-takeover.json").exists()
def test_consume_returns_false_on_start_time_mismatch(self, tmp_path, monkeypatch):
"""PID reuse defence: old marker's start_time mismatches current process."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Marker says target started at time 100 with our PID
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
status.write_takeover_marker(target_pid=os.getpid())
# Now change the reported start_time to simulate PID reuse
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
result = status.consume_takeover_marker_for_self()
assert result is False
def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
result = status.consume_takeover_marker_for_self()
assert result is False
def test_consume_returns_false_for_stale_marker(self, tmp_path, monkeypatch):
"""A marker older than 60s must be ignored."""
from datetime import datetime, timezone, timedelta
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
marker_path = tmp_path / ".gateway-takeover.json"
# Hand-craft a marker written 2 minutes ago
stale_time = (datetime.now(timezone.utc) - timedelta(minutes=2)).isoformat()
marker_path.write_text(json.dumps({
"target_pid": os.getpid(),
"target_start_time": 123,
"replacer_pid": 99999,
"written_at": stale_time,
}))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
result = status.consume_takeover_marker_for_self()
assert result is False
# Stale markers are unlinked so a later legit shutdown isn't griefed
assert not marker_path.exists()
def test_consume_handles_malformed_marker_gracefully(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
marker_path = tmp_path / ".gateway-takeover.json"
marker_path.write_text("not valid json{")
# Must not raise
result = status.consume_takeover_marker_for_self()
assert result is False
def test_consume_handles_marker_with_missing_fields(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
marker_path = tmp_path / ".gateway-takeover.json"
marker_path.write_text(json.dumps({"only_replacer_pid": 99999}))
result = status.consume_takeover_marker_for_self()
assert result is False
# Malformed marker should be cleaned up
assert not marker_path.exists()
def test_clear_takeover_marker_is_idempotent(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Nothing to clear — must not raise
status.clear_takeover_marker()
# Write then clear
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
status.write_takeover_marker(target_pid=12345)
assert (tmp_path / ".gateway-takeover.json").exists()
status.clear_takeover_marker()
assert not (tmp_path / ".gateway-takeover.json").exists()
# Clear again — still no error
status.clear_takeover_marker()
def test_write_marker_returns_false_on_write_failure(self, tmp_path, monkeypatch):
"""write_takeover_marker is best-effort; returns False but doesn't raise."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
def raise_oserror(*args, **kwargs):
raise OSError("simulated write failure")
monkeypatch.setattr(status, "_write_json_file", raise_oserror)
ok = status.write_takeover_marker(target_pid=12345)
assert ok is False
def test_consume_ignores_marker_for_different_process_and_prevents_stale_grief(
self, tmp_path, monkeypatch
):
"""Regression: a stale marker from a dead replacer naming a dead
target must not accidentally cause an unrelated future gateway to
exit 0 on legitimate SIGTERM.
The distinguishing check is ``target_pid == our_pid AND
target_start_time == our_start_time``. Different PID always wins.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
marker_path = tmp_path / ".gateway-takeover.json"
# Fresh marker (timestamp is recent) but names a totally different PID
from datetime import datetime, timezone
marker_path.write_text(json.dumps({
"target_pid": os.getpid() + 10000,
"target_start_time": 42,
"replacer_pid": 99999,
"written_at": datetime.now(timezone.utc).isoformat(),
}))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
result = status.consume_takeover_marker_for_self()
# We are not the target — must NOT consume as planned
assert result is False