hermes-agent/tests/gateway/test_runner_startup_failures.py
Teknium ce9c91c8f7 fix(gateway): close --replace race completely by claiming PID before adapter startup
Follow-up on top of opriz's atomic PID file fix. The prior change caught
the race AFTER runner.start(), so the loser still opened Telegram polling
and Discord gateway sockets before detecting the conflict and exiting.

Hoist the PID-claim block to BEFORE runner.start(). Now the loser of the
O_CREAT|O_EXCL race returns from start_gateway() without ever bringing up
any platform adapter — no Telegram conflict, no Discord duplicate session.

Also add regression tests:
- test_write_pid_file_is_atomic_against_concurrent_writers: second
  write_pid_file() raises FileExistsError rather than clobbering.
- Two existing replace-path tests updated to stateful mocks since the
  real post-kill state (get_running_pid None after remove_pid_file)
  is now exercised by the hoisted re-check.
2026-04-21 00:43:50 -07:00

353 lines
13 KiB
Python

import pytest
from unittest.mock import AsyncMock
from gateway.config import GatewayConfig, Platform, PlatformConfig
from gateway.platforms.base import BasePlatformAdapter
from gateway.run import GatewayRunner
from gateway.status import read_runtime_status
class _RetryableFailureAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM)
async def connect(self) -> bool:
self._set_fatal_error(
"telegram_connect_error",
"Telegram startup failed: temporary DNS resolution failure.",
retryable=True,
)
return False
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
class _DisabledAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=False, token="***"), Platform.TELEGRAM)
async def connect(self) -> bool:
raise AssertionError("connect should not be called for disabled platforms")
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
class _SuccessfulAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.DISCORD)
async def connect(self) -> bool:
return True
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
@pytest.mark.asyncio
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _RetryableFailureAdapter())
ok = await runner.start()
assert ok is False
assert runner.should_exit_cleanly is False
state = read_runtime_status()
assert state["gateway_state"] == "startup_failed"
assert "temporary DNS resolution failure" in state["exit_reason"]
assert state["platforms"]["telegram"]["state"] == "retrying"
assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"
@pytest.mark.asyncio
async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=False, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
ok = await runner.start()
assert ok is True
assert runner.should_exit_cleanly is False
assert runner.adapters == {}
state = read_runtime_status()
assert state["gateway_state"] == "running"
@pytest.mark.asyncio
async def test_runner_records_connected_platform_state_on_success(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _SuccessfulAdapter())
monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: None)
monkeypatch.setattr(runner.hooks, "emit", AsyncMock())
ok = await runner.start()
assert ok is True
state = read_runtime_status()
assert state["gateway_state"] == "running"
assert state["platforms"]["discord"]["state"] == "connected"
assert state["platforms"]["discord"]["error_code"] is None
assert state["platforms"]["discord"]["error_message"] is None
@pytest.mark.asyncio
async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, tmp_path):
"""Verbosity != None must not crash with NameError on RedactingFormatter (#8044)."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
class _CleanExitRunner:
def __init__(self, config):
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.adapters = {}
async def start(self):
return True
async def stop(self):
return None
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
from gateway.run import start_gateway
# verbosity=1 triggers the code path that uses RedactingFormatter.
# Before the fix this raised NameError.
ok = await start_gateway(config=GatewayConfig(), replace=False, verbosity=1)
assert ok is True
@pytest.mark.asyncio
async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
calls = []
class _CleanExitRunner:
def __init__(self, config):
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.adapters = {}
async def start(self):
return True
async def stop(self):
return None
# get_running_pid returns 42 before we kill the old gateway, then None
# after remove_pid_file() clears the record (reflects real behavior).
_pid_state = {"alive": True}
def _mock_get_running_pid():
return 42 if _pid_state["alive"] else None
def _mock_remove_pid_file():
_pid_state["alive"] = False
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
from gateway.run import start_gateway
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is True
assert calls == [(42, False), (42, True)]
@pytest.mark.asyncio
async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
monkeypatch, tmp_path
):
"""--replace must write a takeover marker BEFORE sending SIGTERM.
The marker lets the target's shutdown handler identify the signal as a
planned takeover (→ exit 0) rather than an unexpected kill (→ exit 1).
Without the marker, PR #5646's signal-recovery path would revive the
target via systemd Restart=on-failure, starting a flap loop.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Record the ORDER of marker-write + terminate_pid calls
events: list[str] = []
marker_paths_seen: list = []
def record_write_marker(target_pid: int) -> bool:
events.append(f"write_marker(target_pid={target_pid})")
# Also check that the marker file actually exists after this call
marker_paths_seen.append(
(tmp_path / ".gateway-takeover.json").exists() is False # not yet
)
# Actually write the marker so we can verify cleanup later
from gateway.status import _get_takeover_marker_path, _write_json_file, _get_process_start_time
_write_json_file(_get_takeover_marker_path(), {
"target_pid": target_pid,
"target_start_time": 0,
"replacer_pid": 100,
"written_at": "2026-04-17T00:00:00+00:00",
})
return True
def record_terminate(pid, force=False):
events.append(f"terminate_pid(pid={pid}, force={force})")
class _CleanExitRunner:
def __init__(self, config):
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.adapters = {}
async def start(self):
return True
async def stop(self):
return None
_pid_state = {"alive": True}
def _mock_get_running_pid():
return 42 if _pid_state["alive"] else None
def _mock_remove_pid_file():
_pid_state["alive"] = False
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker)
monkeypatch.setattr("gateway.status.terminate_pid", record_terminate)
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
# Simulate old process exiting on first check so we don't loop into force-kill
monkeypatch.setattr(
"gateway.run.os.kill",
lambda pid, sig: (_ for _ in ()).throw(ProcessLookupError()),
)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
from gateway.run import start_gateway
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is True
# Ordering: marker written BEFORE SIGTERM
assert events[0] == "write_marker(target_pid=42)"
assert any(e.startswith("terminate_pid(pid=42") for e in events[1:])
# Marker file cleanup: replacer cleans it after loop completes
assert not (tmp_path / ".gateway-takeover.json").exists()
@pytest.mark.asyncio
async def test_start_gateway_replace_clears_marker_on_permission_denied(
monkeypatch, tmp_path
):
"""If we fail to kill the existing PID (permission denied), clean up the
marker so it doesn't grief an unrelated future shutdown."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
def write_marker(target_pid: int) -> bool:
from gateway.status import _get_takeover_marker_path, _write_json_file
_write_json_file(_get_takeover_marker_path(), {
"target_pid": target_pid,
"target_start_time": 0,
"replacer_pid": 100,
"written_at": "2026-04-17T00:00:00+00:00",
})
return True
def raise_permission(pid, force=False):
raise PermissionError("simulated EPERM")
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
monkeypatch.setattr("gateway.status.write_takeover_marker", write_marker)
monkeypatch.setattr("gateway.status.terminate_pid", raise_permission)
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
from gateway.run import start_gateway
# Should return False due to permission error
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is False
# Marker must NOT be left behind
assert not (tmp_path / ".gateway-takeover.json").exists()
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("TERMINAL_ENV", "docker")
monkeypatch.setenv("TERMINAL_DOCKER_VOLUMES", '["/etc/localtime:/etc/localtime:ro"]')
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
with caplog.at_level("WARNING"):
GatewayRunner(config)
assert any(
"host-visible output mount" in record.message
for record in caplog.records
)