mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Follow-up on top of opriz's atomic PID file fix. The prior change caught the race AFTER runner.start(), so the loser still opened Telegram polling and Discord gateway sockets before detecting the conflict and exiting. Hoist the PID-claim block to BEFORE runner.start(). Now the loser of the O_CREAT|O_EXCL race returns from start_gateway() without ever bringing up any platform adapter — no Telegram conflict, no Discord duplicate session. Also add regression tests: - test_write_pid_file_is_atomic_against_concurrent_writers: second write_pid_file() raises FileExistsError rather than clobbering. - Two existing replace-path tests updated to stateful mocks since the real post-kill state (get_running_pid None after remove_pid_file) is now exercised by the hoisted re-check.
353 lines
13 KiB
Python
353 lines
13 KiB
Python
import pytest
|
|
from unittest.mock import AsyncMock
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.base import BasePlatformAdapter
|
|
from gateway.run import GatewayRunner
|
|
from gateway.status import read_runtime_status
|
|
|
|
|
|
class _RetryableFailureAdapter(BasePlatformAdapter):
|
|
def __init__(self):
|
|
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM)
|
|
|
|
async def connect(self) -> bool:
|
|
self._set_fatal_error(
|
|
"telegram_connect_error",
|
|
"Telegram startup failed: temporary DNS resolution failure.",
|
|
retryable=True,
|
|
)
|
|
return False
|
|
|
|
async def disconnect(self) -> None:
|
|
self._mark_disconnected()
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
raise NotImplementedError
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
|
|
class _DisabledAdapter(BasePlatformAdapter):
|
|
def __init__(self):
|
|
super().__init__(PlatformConfig(enabled=False, token="***"), Platform.TELEGRAM)
|
|
|
|
async def connect(self) -> bool:
|
|
raise AssertionError("connect should not be called for disabled platforms")
|
|
|
|
async def disconnect(self) -> None:
|
|
self._mark_disconnected()
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
raise NotImplementedError
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
|
|
class _SuccessfulAdapter(BasePlatformAdapter):
|
|
def __init__(self):
|
|
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.DISCORD)
|
|
|
|
async def connect(self) -> bool:
|
|
return True
|
|
|
|
async def disconnect(self) -> None:
|
|
self._mark_disconnected()
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
raise NotImplementedError
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
config = GatewayConfig(
|
|
platforms={
|
|
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
|
|
},
|
|
sessions_dir=tmp_path / "sessions",
|
|
)
|
|
runner = GatewayRunner(config)
|
|
|
|
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _RetryableFailureAdapter())
|
|
|
|
ok = await runner.start()
|
|
|
|
assert ok is False
|
|
assert runner.should_exit_cleanly is False
|
|
state = read_runtime_status()
|
|
assert state["gateway_state"] == "startup_failed"
|
|
assert "temporary DNS resolution failure" in state["exit_reason"]
|
|
assert state["platforms"]["telegram"]["state"] == "retrying"
|
|
assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
config = GatewayConfig(
|
|
platforms={
|
|
Platform.TELEGRAM: PlatformConfig(enabled=False, token="***")
|
|
},
|
|
sessions_dir=tmp_path / "sessions",
|
|
)
|
|
runner = GatewayRunner(config)
|
|
|
|
ok = await runner.start()
|
|
|
|
assert ok is True
|
|
assert runner.should_exit_cleanly is False
|
|
assert runner.adapters == {}
|
|
state = read_runtime_status()
|
|
assert state["gateway_state"] == "running"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_runner_records_connected_platform_state_on_success(monkeypatch, tmp_path):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
config = GatewayConfig(
|
|
platforms={
|
|
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
|
|
},
|
|
sessions_dir=tmp_path / "sessions",
|
|
)
|
|
runner = GatewayRunner(config)
|
|
|
|
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _SuccessfulAdapter())
|
|
monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: None)
|
|
monkeypatch.setattr(runner.hooks, "emit", AsyncMock())
|
|
|
|
ok = await runner.start()
|
|
|
|
assert ok is True
|
|
state = read_runtime_status()
|
|
assert state["gateway_state"] == "running"
|
|
assert state["platforms"]["discord"]["state"] == "connected"
|
|
assert state["platforms"]["discord"]["error_code"] is None
|
|
assert state["platforms"]["discord"]["error_message"] is None
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, tmp_path):
|
|
"""Verbosity != None must not crash with NameError on RedactingFormatter (#8044)."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
class _CleanExitRunner:
|
|
def __init__(self, config):
|
|
self.config = config
|
|
self.should_exit_cleanly = True
|
|
self.exit_reason = None
|
|
self.adapters = {}
|
|
|
|
async def start(self):
|
|
return True
|
|
|
|
async def stop(self):
|
|
return None
|
|
|
|
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
|
|
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
|
|
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
|
|
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
|
|
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
|
|
|
|
from gateway.run import start_gateway
|
|
|
|
# verbosity=1 triggers the code path that uses RedactingFormatter.
|
|
# Before the fix this raised NameError.
|
|
ok = await start_gateway(config=GatewayConfig(), replace=False, verbosity=1)
|
|
|
|
assert ok is True
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_path):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
calls = []
|
|
|
|
class _CleanExitRunner:
|
|
def __init__(self, config):
|
|
self.config = config
|
|
self.should_exit_cleanly = True
|
|
self.exit_reason = None
|
|
self.adapters = {}
|
|
|
|
async def start(self):
|
|
return True
|
|
|
|
async def stop(self):
|
|
return None
|
|
|
|
# get_running_pid returns 42 before we kill the old gateway, then None
|
|
# after remove_pid_file() clears the record (reflects real behavior).
|
|
_pid_state = {"alive": True}
|
|
def _mock_get_running_pid():
|
|
return 42 if _pid_state["alive"] else None
|
|
def _mock_remove_pid_file():
|
|
_pid_state["alive"] = False
|
|
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
|
|
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
|
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
|
|
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
|
|
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
|
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
|
|
monkeypatch.setattr("time.sleep", lambda _: None)
|
|
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
|
|
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
|
|
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
|
|
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
|
|
|
|
from gateway.run import start_gateway
|
|
|
|
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
|
|
|
|
assert ok is True
|
|
assert calls == [(42, False), (42, True)]
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
|
|
monkeypatch, tmp_path
|
|
):
|
|
"""--replace must write a takeover marker BEFORE sending SIGTERM.
|
|
|
|
The marker lets the target's shutdown handler identify the signal as a
|
|
planned takeover (→ exit 0) rather than an unexpected kill (→ exit 1).
|
|
Without the marker, PR #5646's signal-recovery path would revive the
|
|
target via systemd Restart=on-failure, starting a flap loop.
|
|
"""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
# Record the ORDER of marker-write + terminate_pid calls
|
|
events: list[str] = []
|
|
marker_paths_seen: list = []
|
|
|
|
def record_write_marker(target_pid: int) -> bool:
|
|
events.append(f"write_marker(target_pid={target_pid})")
|
|
# Also check that the marker file actually exists after this call
|
|
marker_paths_seen.append(
|
|
(tmp_path / ".gateway-takeover.json").exists() is False # not yet
|
|
)
|
|
# Actually write the marker so we can verify cleanup later
|
|
from gateway.status import _get_takeover_marker_path, _write_json_file, _get_process_start_time
|
|
_write_json_file(_get_takeover_marker_path(), {
|
|
"target_pid": target_pid,
|
|
"target_start_time": 0,
|
|
"replacer_pid": 100,
|
|
"written_at": "2026-04-17T00:00:00+00:00",
|
|
})
|
|
return True
|
|
|
|
def record_terminate(pid, force=False):
|
|
events.append(f"terminate_pid(pid={pid}, force={force})")
|
|
|
|
class _CleanExitRunner:
|
|
def __init__(self, config):
|
|
self.config = config
|
|
self.should_exit_cleanly = True
|
|
self.exit_reason = None
|
|
self.adapters = {}
|
|
|
|
async def start(self):
|
|
return True
|
|
|
|
async def stop(self):
|
|
return None
|
|
|
|
_pid_state = {"alive": True}
|
|
def _mock_get_running_pid():
|
|
return 42 if _pid_state["alive"] else None
|
|
def _mock_remove_pid_file():
|
|
_pid_state["alive"] = False
|
|
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
|
|
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
|
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
|
|
monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker)
|
|
monkeypatch.setattr("gateway.status.terminate_pid", record_terminate)
|
|
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
|
# Simulate old process exiting on first check so we don't loop into force-kill
|
|
monkeypatch.setattr(
|
|
"gateway.run.os.kill",
|
|
lambda pid, sig: (_ for _ in ()).throw(ProcessLookupError()),
|
|
)
|
|
monkeypatch.setattr("time.sleep", lambda _: None)
|
|
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
|
|
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
|
|
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
|
|
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
|
|
|
|
from gateway.run import start_gateway
|
|
|
|
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
|
|
|
|
assert ok is True
|
|
# Ordering: marker written BEFORE SIGTERM
|
|
assert events[0] == "write_marker(target_pid=42)"
|
|
assert any(e.startswith("terminate_pid(pid=42") for e in events[1:])
|
|
# Marker file cleanup: replacer cleans it after loop completes
|
|
assert not (tmp_path / ".gateway-takeover.json").exists()
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_start_gateway_replace_clears_marker_on_permission_denied(
|
|
monkeypatch, tmp_path
|
|
):
|
|
"""If we fail to kill the existing PID (permission denied), clean up the
|
|
marker so it doesn't grief an unrelated future shutdown."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
|
|
def write_marker(target_pid: int) -> bool:
|
|
from gateway.status import _get_takeover_marker_path, _write_json_file
|
|
_write_json_file(_get_takeover_marker_path(), {
|
|
"target_pid": target_pid,
|
|
"target_start_time": 0,
|
|
"replacer_pid": 100,
|
|
"written_at": "2026-04-17T00:00:00+00:00",
|
|
})
|
|
return True
|
|
|
|
def raise_permission(pid, force=False):
|
|
raise PermissionError("simulated EPERM")
|
|
|
|
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
|
|
monkeypatch.setattr("gateway.status.write_takeover_marker", write_marker)
|
|
monkeypatch.setattr("gateway.status.terminate_pid", raise_permission)
|
|
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
|
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
|
|
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
|
|
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
|
|
|
|
from gateway.run import start_gateway
|
|
|
|
# Should return False due to permission error
|
|
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
|
|
|
|
assert ok is False
|
|
# Marker must NOT be left behind
|
|
assert not (tmp_path / ".gateway-takeover.json").exists()
|
|
|
|
|
|
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
monkeypatch.setenv("TERMINAL_ENV", "docker")
|
|
monkeypatch.setenv("TERMINAL_DOCKER_VOLUMES", '["/etc/localtime:/etc/localtime:ro"]')
|
|
config = GatewayConfig(
|
|
platforms={
|
|
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
|
|
},
|
|
sessions_dir=tmp_path / "sessions",
|
|
)
|
|
|
|
with caplog.at_level("WARNING"):
|
|
GatewayRunner(config)
|
|
|
|
assert any(
|
|
"host-visible output mount" in record.message
|
|
for record in caplog.records
|
|
)
|