mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Follow-up on top of opriz's atomic PID file fix. The prior change caught the race AFTER runner.start(), so the loser still opened Telegram polling and Discord gateway sockets before detecting the conflict and exiting. Hoist the PID-claim block to BEFORE runner.start(). Now the loser of the O_CREAT|O_EXCL race returns from start_gateway() without ever bringing up any platform adapter — no Telegram conflict, no Discord duplicate session. Also add regression tests: - test_write_pid_file_is_atomic_against_concurrent_writers: second write_pid_file() raises FileExistsError rather than clobbering. - Two existing replace-path tests updated to stateful mocks since the real post-kill state (get_running_pid None after remove_pid_file) is now exercised by the hoisted re-check.
468 lines
19 KiB
Python
468 lines
19 KiB
Python
"""Tests for gateway runtime status tracking."""
|
|
|
|
import json
|
|
import os
|
|
from types import SimpleNamespace
|
|
|
|
from gateway import status
|
|
|
|
|
|
class TestGatewayPidState:
|
|
def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
status.write_pid_file()
|
|
|
|
payload = json.loads((tmp_path / "gateway.pid").read_text())
|
|
assert payload["pid"] == os.getpid()
|
|
assert payload["kind"] == "hermes-gateway"
|
|
assert isinstance(payload["argv"], list)
|
|
assert payload["argv"]
|
|
|
|
def test_write_pid_file_is_atomic_against_concurrent_writers(self, tmp_path, monkeypatch):
|
|
"""Regression: two concurrent --replace invocations must not both win.
|
|
|
|
Without O_CREAT|O_EXCL, two processes racing through start_gateway()'s
|
|
termination-wait would both write to gateway.pid, silently overwriting
|
|
each other and leaving multiple gateway instances alive (#11718).
|
|
"""
|
|
import pytest
|
|
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
# First write wins.
|
|
status.write_pid_file()
|
|
assert (tmp_path / "gateway.pid").exists()
|
|
|
|
# Second write (simulating a racing --replace that missed the earlier
|
|
# guards) must raise FileExistsError rather than clobber the record.
|
|
with pytest.raises(FileExistsError):
|
|
status.write_pid_file()
|
|
|
|
# Original record is preserved.
|
|
payload = json.loads((tmp_path / "gateway.pid").read_text())
|
|
assert payload["pid"] == os.getpid()
|
|
|
|
def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
pid_path = tmp_path / "gateway.pid"
|
|
pid_path.write_text(str(os.getpid()))
|
|
|
|
assert status.get_running_pid() is None
|
|
assert not pid_path.exists()
|
|
|
|
def test_get_running_pid_accepts_gateway_metadata_when_cmdline_unavailable(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
pid_path = tmp_path / "gateway.pid"
|
|
pid_path.write_text(json.dumps({
|
|
"pid": os.getpid(),
|
|
"kind": "hermes-gateway",
|
|
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
|
|
"start_time": 123,
|
|
}))
|
|
|
|
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
|
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
|
|
|
|
assert status.get_running_pid() == os.getpid()
|
|
|
|
def test_get_running_pid_accepts_script_style_gateway_cmdline(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
pid_path = tmp_path / "gateway.pid"
|
|
pid_path.write_text(json.dumps({
|
|
"pid": os.getpid(),
|
|
"kind": "hermes-gateway",
|
|
"argv": ["/venv/bin/python", "/repo/hermes_cli/main.py", "gateway", "run", "--replace"],
|
|
"start_time": 123,
|
|
}))
|
|
|
|
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
|
monkeypatch.setattr(
|
|
status,
|
|
"_read_process_cmdline",
|
|
lambda pid: "/venv/bin/python /repo/hermes_cli/main.py gateway run --replace",
|
|
)
|
|
|
|
assert status.get_running_pid() == os.getpid()
|
|
|
|
def test_get_running_pid_accepts_explicit_pid_path_without_cleanup(self, tmp_path, monkeypatch):
|
|
other_home = tmp_path / "profile-home"
|
|
other_home.mkdir()
|
|
pid_path = other_home / "gateway.pid"
|
|
pid_path.write_text(json.dumps({
|
|
"pid": os.getpid(),
|
|
"kind": "hermes-gateway",
|
|
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
|
|
"start_time": 123,
|
|
}))
|
|
|
|
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
|
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
|
|
|
|
assert status.get_running_pid(pid_path, cleanup_stale=False) == os.getpid()
|
|
assert pid_path.exists()
|
|
|
|
|
|
class TestGatewayRuntimeStatus:
|
|
def test_write_runtime_status_overwrites_stale_pid_on_restart(self, tmp_path, monkeypatch):
|
|
"""Regression: setdefault() preserved stale PID from previous process (#1631)."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
# Simulate a previous gateway run that left a state file with a stale PID
|
|
state_path = tmp_path / "gateway_state.json"
|
|
state_path.write_text(json.dumps({
|
|
"pid": 99999,
|
|
"start_time": 1000.0,
|
|
"kind": "hermes-gateway",
|
|
"platforms": {},
|
|
"updated_at": "2025-01-01T00:00:00Z",
|
|
}))
|
|
|
|
status.write_runtime_status(gateway_state="running")
|
|
|
|
payload = status.read_runtime_status()
|
|
assert payload["pid"] == os.getpid(), "PID should be overwritten, not preserved via setdefault"
|
|
assert payload["start_time"] != 1000.0, "start_time should be overwritten on restart"
|
|
|
|
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
status.write_runtime_status(
|
|
gateway_state="startup_failed",
|
|
exit_reason="telegram conflict",
|
|
platform="telegram",
|
|
platform_state="fatal",
|
|
error_code="telegram_polling_conflict",
|
|
error_message="another poller is active",
|
|
)
|
|
|
|
payload = status.read_runtime_status()
|
|
assert payload["gateway_state"] == "startup_failed"
|
|
assert payload["exit_reason"] == "telegram conflict"
|
|
assert payload["platforms"]["telegram"]["state"] == "fatal"
|
|
assert payload["platforms"]["telegram"]["error_code"] == "telegram_polling_conflict"
|
|
assert payload["platforms"]["telegram"]["error_message"] == "another poller is active"
|
|
|
|
def test_write_runtime_status_explicit_none_clears_stale_fields(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
status.write_runtime_status(
|
|
gateway_state="startup_failed",
|
|
exit_reason="stale error",
|
|
platform="discord",
|
|
platform_state="fatal",
|
|
error_code="discord_timeout",
|
|
error_message="stale platform error",
|
|
)
|
|
|
|
status.write_runtime_status(
|
|
gateway_state="running",
|
|
exit_reason=None,
|
|
platform="discord",
|
|
platform_state="connected",
|
|
error_code=None,
|
|
error_message=None,
|
|
)
|
|
|
|
payload = status.read_runtime_status()
|
|
assert payload["gateway_state"] == "running"
|
|
assert payload["exit_reason"] is None
|
|
assert payload["platforms"]["discord"]["state"] == "connected"
|
|
assert payload["platforms"]["discord"]["error_code"] is None
|
|
assert payload["platforms"]["discord"]["error_message"] is None
|
|
|
|
|
|
class TestTerminatePid:
|
|
def test_force_uses_taskkill_on_windows(self, monkeypatch):
|
|
calls = []
|
|
monkeypatch.setattr(status, "_IS_WINDOWS", True)
|
|
|
|
def fake_run(cmd, capture_output=False, text=False, timeout=None):
|
|
calls.append((cmd, capture_output, text, timeout))
|
|
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
|
|
|
monkeypatch.setattr(status.subprocess, "run", fake_run)
|
|
|
|
status.terminate_pid(123, force=True)
|
|
|
|
assert calls == [
|
|
(["taskkill", "/PID", "123", "/T", "/F"], True, True, 10)
|
|
]
|
|
|
|
def test_force_falls_back_to_sigterm_when_taskkill_missing(self, monkeypatch):
|
|
calls = []
|
|
monkeypatch.setattr(status, "_IS_WINDOWS", True)
|
|
|
|
def fake_run(*args, **kwargs):
|
|
raise FileNotFoundError
|
|
|
|
def fake_kill(pid, sig):
|
|
calls.append((pid, sig))
|
|
|
|
monkeypatch.setattr(status.subprocess, "run", fake_run)
|
|
monkeypatch.setattr(status.os, "kill", fake_kill)
|
|
|
|
status.terminate_pid(456, force=True)
|
|
|
|
assert calls == [(456, status.signal.SIGTERM)]
|
|
|
|
|
|
class TestScopedLocks:
|
|
def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
lock_path.write_text(json.dumps({
|
|
"pid": 99999,
|
|
"start_time": 123,
|
|
"kind": "hermes-gateway",
|
|
}))
|
|
|
|
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
|
|
|
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
|
|
|
assert acquired is False
|
|
assert existing["pid"] == 99999
|
|
|
|
def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
lock_path.write_text(json.dumps({
|
|
"pid": 99999,
|
|
"start_time": 123,
|
|
"kind": "hermes-gateway",
|
|
}))
|
|
|
|
def fake_kill(pid, sig):
|
|
raise ProcessLookupError
|
|
|
|
monkeypatch.setattr(status.os, "kill", fake_kill)
|
|
|
|
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
|
|
|
assert acquired is True
|
|
payload = json.loads(lock_path.read_text())
|
|
assert payload["pid"] == os.getpid()
|
|
assert payload["metadata"]["platform"] == "telegram"
|
|
|
|
def test_acquire_scoped_lock_recovers_empty_lock_file(self, tmp_path, monkeypatch):
|
|
"""Empty lock file (0 bytes) left by a crashed process should be treated as stale."""
|
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
|
lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"
|
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
lock_path.write_text("") # simulate crash between O_CREAT and json.dump
|
|
|
|
acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
|
|
|
|
assert acquired is True
|
|
payload = json.loads(lock_path.read_text())
|
|
assert payload["pid"] == os.getpid()
|
|
assert payload["metadata"]["platform"] == "slack"
|
|
|
|
def test_acquire_scoped_lock_recovers_corrupt_lock_file(self, tmp_path, monkeypatch):
|
|
"""Lock file with invalid JSON should be treated as stale."""
|
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
|
lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"
|
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
lock_path.write_text("{truncated") # simulate partial write
|
|
|
|
acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
|
|
|
|
assert acquired is True
|
|
payload = json.loads(lock_path.read_text())
|
|
assert payload["pid"] == os.getpid()
|
|
|
|
def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
|
|
|
acquired, _ = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
|
assert acquired is True
|
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
|
assert lock_path.exists()
|
|
|
|
status.release_scoped_lock("telegram-bot-token", "secret")
|
|
assert not lock_path.exists()
|
|
|
|
|
|
class TestTakeoverMarker:
|
|
"""Tests for the --replace takeover marker.
|
|
|
|
The marker breaks the post-#5646 flap loop between two gateway services
|
|
fighting for the same bot token. The replacer writes a file naming the
|
|
target PID + start_time; the target's shutdown handler sees it and exits
|
|
0 instead of 1, so systemd's Restart=on-failure doesn't revive it.
|
|
"""
|
|
|
|
def test_write_marker_records_target_identity(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
|
|
|
|
ok = status.write_takeover_marker(target_pid=12345)
|
|
|
|
assert ok is True
|
|
marker = tmp_path / ".gateway-takeover.json"
|
|
assert marker.exists()
|
|
payload = json.loads(marker.read_text())
|
|
assert payload["target_pid"] == 12345
|
|
assert payload["target_start_time"] == 42
|
|
assert payload["replacer_pid"] == os.getpid()
|
|
assert "written_at" in payload
|
|
|
|
def test_consume_returns_true_when_marker_names_self(self, tmp_path, monkeypatch):
|
|
"""Primary happy path: planned takeover is recognised."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
# Mark THIS process as the target
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
|
ok = status.write_takeover_marker(target_pid=os.getpid())
|
|
assert ok is True
|
|
|
|
# Call consume as if this process just got SIGTERMed
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is True
|
|
# Marker must be unlinked after consumption
|
|
assert not (tmp_path / ".gateway-takeover.json").exists()
|
|
|
|
def test_consume_returns_false_for_different_pid(self, tmp_path, monkeypatch):
|
|
"""A marker naming a DIFFERENT process must not be consumed as ours."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
|
# Marker names a different PID
|
|
other_pid = os.getpid() + 9999
|
|
ok = status.write_takeover_marker(target_pid=other_pid)
|
|
assert ok is True
|
|
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is False
|
|
# Marker IS unlinked even on non-match (the record has been consumed
|
|
# and isn't relevant to us — leaving it around would grief a later
|
|
# legitimate check).
|
|
assert not (tmp_path / ".gateway-takeover.json").exists()
|
|
|
|
def test_consume_returns_false_on_start_time_mismatch(self, tmp_path, monkeypatch):
|
|
"""PID reuse defence: old marker's start_time mismatches current process."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
# Marker says target started at time 100 with our PID
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
|
status.write_takeover_marker(target_pid=os.getpid())
|
|
|
|
# Now change the reported start_time to simulate PID reuse
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
|
|
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is False
|
|
|
|
def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is False
|
|
|
|
def test_consume_returns_false_for_stale_marker(self, tmp_path, monkeypatch):
|
|
"""A marker older than 60s must be ignored."""
|
|
from datetime import datetime, timezone, timedelta
|
|
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
marker_path = tmp_path / ".gateway-takeover.json"
|
|
# Hand-craft a marker written 2 minutes ago
|
|
stale_time = (datetime.now(timezone.utc) - timedelta(minutes=2)).isoformat()
|
|
marker_path.write_text(json.dumps({
|
|
"target_pid": os.getpid(),
|
|
"target_start_time": 123,
|
|
"replacer_pid": 99999,
|
|
"written_at": stale_time,
|
|
}))
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
|
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is False
|
|
# Stale markers are unlinked so a later legit shutdown isn't griefed
|
|
assert not marker_path.exists()
|
|
|
|
def test_consume_handles_malformed_marker_gracefully(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
marker_path = tmp_path / ".gateway-takeover.json"
|
|
marker_path.write_text("not valid json{")
|
|
|
|
# Must not raise
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is False
|
|
|
|
def test_consume_handles_marker_with_missing_fields(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
marker_path = tmp_path / ".gateway-takeover.json"
|
|
marker_path.write_text(json.dumps({"only_replacer_pid": 99999}))
|
|
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
assert result is False
|
|
# Malformed marker should be cleaned up
|
|
assert not marker_path.exists()
|
|
|
|
def test_clear_takeover_marker_is_idempotent(self, tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
# Nothing to clear — must not raise
|
|
status.clear_takeover_marker()
|
|
|
|
# Write then clear
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
|
status.write_takeover_marker(target_pid=12345)
|
|
assert (tmp_path / ".gateway-takeover.json").exists()
|
|
|
|
status.clear_takeover_marker()
|
|
assert not (tmp_path / ".gateway-takeover.json").exists()
|
|
|
|
# Clear again — still no error
|
|
status.clear_takeover_marker()
|
|
|
|
def test_write_marker_returns_false_on_write_failure(self, tmp_path, monkeypatch):
|
|
"""write_takeover_marker is best-effort; returns False but doesn't raise."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
def raise_oserror(*args, **kwargs):
|
|
raise OSError("simulated write failure")
|
|
|
|
monkeypatch.setattr(status, "_write_json_file", raise_oserror)
|
|
|
|
ok = status.write_takeover_marker(target_pid=12345)
|
|
|
|
assert ok is False
|
|
|
|
def test_consume_ignores_marker_for_different_process_and_prevents_stale_grief(
|
|
self, tmp_path, monkeypatch
|
|
):
|
|
"""Regression: a stale marker from a dead replacer naming a dead
|
|
target must not accidentally cause an unrelated future gateway to
|
|
exit 0 on legitimate SIGTERM.
|
|
|
|
The distinguishing check is ``target_pid == our_pid AND
|
|
target_start_time == our_start_time``. Different PID always wins.
|
|
"""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
marker_path = tmp_path / ".gateway-takeover.json"
|
|
# Fresh marker (timestamp is recent) but names a totally different PID
|
|
from datetime import datetime, timezone
|
|
marker_path.write_text(json.dumps({
|
|
"target_pid": os.getpid() + 10000,
|
|
"target_start_time": 42,
|
|
"replacer_pid": 99999,
|
|
"written_at": datetime.now(timezone.utc).isoformat(),
|
|
}))
|
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
|
|
|
|
result = status.consume_takeover_marker_for_self()
|
|
|
|
# We are not the target — must NOT consume as planned
|
|
assert result is False
|