mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Stop the gateway from exiting (or systemd-restart-looping) when a single
messaging adapter fails at startup or runtime. A misconfigured WhatsApp
(npm install timeout, unpaired bridge, missing creds.json) used to take
the entire gateway down, killing cron jobs and any other connected
platforms with it.
Changes:
• Startup (gateway/run.py): when connected_count==0 but the only
errors are retryable, log a degraded-state warning and keep the
gateway alive instead of returning False. Reconnect watcher then
recovers platforms as their underlying problem clears.
• Runtime (gateway/run.py _handle_adapter_fatal_error): when the last
adapter goes down with a retryable error and is queued for
reconnection, stay alive instead of exit-with-failure. Previously
this triggered systemd Restart=on-failure, which created infinite
restart loops on persistent retryable failures (proxy outage,
repeated bridge crashes).
• Reconnect watcher (gateway/run.py _platform_reconnect_watcher):
replace the 20-attempt hard drop with a circuit-breaker pause.
After _PAUSE_AFTER_FAILURES (10) consecutive retryable failures, the
platform stays in _failed_platforms with paused=True so the watcher
skips it but the operator can still see and resume it. Non-retryable
errors still drop out of the queue immediately. Resolves #17063
(gateway giving up on Telegram after 20 attempts).
• WhatsApp preflight (gateway/platforms/whatsapp.py): refuse to start
the Node bridge when creds.json is missing. Sets a non-retryable
whatsapp_not_paired fatal error so the watcher drops it cleanly
with a single 'run hermes whatsapp' log line instead of paying the
30s bridge bootstrap timeout on every gateway start.
• WhatsApp setup ordering (hermes_cli/main.py cmd_whatsapp): only set
WHATSAPP_ENABLED=true once pairing actually succeeds. Previously
the wizard wrote the env var at step 2 (before npm install and QR
pairing), so any Ctrl+C left .env claiming WhatsApp was ready when
the bridge had no creds.json. Also propagate the env var when the
user keeps an existing pairing on a re-run.
• /platform slash command (hermes_cli/commands.py + gateway/run.py):
new gateway-only command for manual circuit-breaker control.
/platform list — show connected + failed/paused platforms
/platform pause <name> — silence a known-broken platform
/platform resume <name> — re-queue a paused platform
Tests:
• New: pause/resume helpers, /platform list|pause|resume command,
WhatsApp creds.json preflight, WhatsApp setup ordering.
• Updated: stale assertions that codified the old 'exit and let
systemd restart' behavior in test_runner_fatal_adapter.py,
test_runner_startup_failures.py, and test_platform_reconnect.py
(the 20-attempt give-up test became a circuit-breaker pause test).
5488 tests pass in tests/gateway/.
409 lines
16 KiB
Python
409 lines
16 KiB
Python
import pytest
|
|
from unittest.mock import AsyncMock
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.base import BasePlatformAdapter
|
|
from gateway.run import GatewayRunner
|
|
from gateway.status import read_runtime_status
|
|
|
|
|
|
class _RetryableFailureAdapter(BasePlatformAdapter):
    """Telegram adapter stub whose ``connect`` always fails with a retryable error."""

    def __init__(self):
        stub_config = PlatformConfig(enabled=True, token="***")
        super().__init__(stub_config, Platform.TELEGRAM)

    async def connect(self) -> bool:
        """Flag a retryable fatal error, then report the connection as failed."""
        error_code = "telegram_connect_error"
        error_message = "Telegram startup failed: temporary DNS resolution failure."
        self._set_fatal_error(error_code, error_message, retryable=True)
        return False

    async def disconnect(self) -> None:
        """Delegate to ``_mark_disconnected``; the stub holds nothing else to release."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not supported by this stub."""
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal chat record that echoes ``chat_id`` back."""
        chat_info = {"id": chat_id}
        return chat_info
|
|
|
|
|
|
class _DisabledAdapter(BasePlatformAdapter):
    """Telegram adapter stub built from a disabled config; connecting is an error."""

    def __init__(self):
        stub_config = PlatformConfig(enabled=False, token="***")
        super().__init__(stub_config, Platform.TELEGRAM)

    async def connect(self) -> bool:
        """Fail loudly: a disabled platform must never be asked to connect."""
        failure_reason = "connect should not be called for disabled platforms"
        raise AssertionError(failure_reason)

    async def disconnect(self) -> None:
        """Delegate to ``_mark_disconnected``; the stub holds nothing else to release."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not supported by this stub."""
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal chat record that echoes ``chat_id`` back."""
        chat_info = {"id": chat_id}
        return chat_info
|
|
|
|
|
|
class _SuccessfulAdapter(BasePlatformAdapter):
    """Discord adapter stub whose ``connect`` always succeeds."""

    def __init__(self):
        stub_config = PlatformConfig(enabled=True, token="***")
        super().__init__(stub_config, Platform.DISCORD)

    async def connect(self) -> bool:
        """Report a successful connection without doing any I/O."""
        return True

    async def disconnect(self) -> None:
        """Delegate to ``_mark_disconnected``; the stub holds nothing else to release."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not supported by this stub."""
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal chat record that echoes ``chat_id`` back."""
        chat_info = {"id": chat_id}
        return chat_info
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_runner_stays_alive_for_retryable_startup_errors(monkeypatch, tmp_path):
    """A retryable startup failure must leave the gateway up in degraded mode.

    The reconnect watcher can then recover the platform once the underlying
    problem clears. Previously ``start()`` returned False and the process
    exited, which turned a single broken platform (e.g. unpaired WhatsApp,
    DNS blip on Telegram) into a systemd restart loop and killed cron jobs
    in the meantime.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    telegram_config = PlatformConfig(enabled=True, token="***")
    runner = GatewayRunner(
        GatewayConfig(
            platforms={Platform.TELEGRAM: telegram_config},
            sessions_dir=tmp_path / "sessions",
        )
    )

    def _make_failing_adapter(platform, platform_config):
        return _RetryableFailureAdapter()

    monkeypatch.setattr(runner, "_create_adapter", _make_failing_adapter)

    started = await runner.start()

    # Gateway stays alive in degraded mode; reconnect watcher takes over.
    assert started is True
    assert runner.should_exit_cleanly is False
    status = read_runtime_status()
    assert status["gateway_state"] in {"degraded", "running"}

    # Telegram was queued for retry, not given up on.
    assert Platform.TELEGRAM in runner._failed_platforms
    telegram_status = status["platforms"]["telegram"]
    assert telegram_status["state"] == "retrying"
    assert telegram_status["error_code"] == "telegram_connect_error"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
    """A config whose only platform is disabled still starts, with no adapters."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    disabled_telegram = PlatformConfig(enabled=False, token="***")
    runner = GatewayRunner(
        GatewayConfig(
            platforms={Platform.TELEGRAM: disabled_telegram},
            sessions_dir=tmp_path / "sessions",
        )
    )

    started = await runner.start()

    assert started is True
    assert runner.should_exit_cleanly is False
    assert runner.adapters == {}
    status = read_runtime_status()
    assert status["gateway_state"] == "running"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_runner_records_connected_platform_state_on_success(monkeypatch, tmp_path):
    """A successful connect is written to runtime status as ``connected`` with no error."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    discord_config = PlatformConfig(enabled=True, token="***")
    runner = GatewayRunner(
        GatewayConfig(
            platforms={Platform.DISCORD: discord_config},
            sessions_dir=tmp_path / "sessions",
        )
    )

    def _make_working_adapter(platform, platform_config):
        return _SuccessfulAdapter()

    monkeypatch.setattr(runner, "_create_adapter", _make_working_adapter)
    # Replace hook discovery and emission with inert stand-ins.
    monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: None)
    monkeypatch.setattr(runner.hooks, "emit", AsyncMock())

    started = await runner.start()

    assert started is True
    status = read_runtime_status()
    assert status["gateway_state"] == "running"
    discord_status = status["platforms"]["discord"]
    assert discord_status["state"] == "connected"
    assert discord_status["error_code"] is None
    assert discord_status["error_message"] is None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, tmp_path):
    """Verbosity != None must not crash with NameError on RedactingFormatter (#8044)."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    class _CleanExitRunner:
        """Runner double: starts successfully, stops trivially, reports a clean exit."""

        def __init__(self, config):
            self.adapters = {}
            self.exit_reason = None
            self.should_exit_cleanly = True
            self.config = config

        async def start(self):
            return True

        async def stop(self):
            return None

    # Dotted-path patches, applied in the same order as listed.
    patches = (
        ("gateway.status.get_running_pid", lambda: None),
        ("tools.skills_sync.sync_skills", lambda quiet=True: None),
        ("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path),
        ("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None),
        ("gateway.run.GatewayRunner", _CleanExitRunner),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    from gateway.run import start_gateway

    # verbosity=1 triggers the code path that uses RedactingFormatter.
    # Before the fix this raised NameError.
    started = await start_gateway(config=GatewayConfig(), replace=False, verbosity=1)

    assert started is True
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_path):
    """With ``replace=True``, ``terminate_pid`` is called first gently, then with force.

    The stubbed ``os.kill`` never raises, so the old gateway (PID 42) keeps
    looking alive and the replacer is expected to escalate.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    terminate_calls = []

    class _CleanExitRunner:
        """Runner double: starts successfully, stops trivially, reports a clean exit."""

        def __init__(self, config):
            self.adapters = {}
            self.exit_reason = None
            self.should_exit_cleanly = True
            self.config = config

        async def start(self):
            return True

        async def stop(self):
            return None

    # get_running_pid returns 42 before we kill the old gateway, then None
    # after remove_pid_file() clears the record (reflects real behavior).
    old_gateway_recorded = True

    def _mock_get_running_pid():
        return 42 if old_gateway_recorded else None

    def _mock_remove_pid_file():
        nonlocal old_gateway_recorded
        old_gateway_recorded = False

    def _record_terminate(pid, force=False):
        terminate_calls.append((pid, force))

    patches = (
        ("gateway.status.get_running_pid", _mock_get_running_pid),
        ("gateway.status.remove_pid_file", _mock_remove_pid_file),
        ("gateway.status.release_all_scoped_locks", lambda **kwargs: 0),
        ("gateway.status.terminate_pid", _record_terminate),
        ("gateway.run.os.getpid", lambda: 100),
        ("gateway.run.os.kill", lambda pid, sig: None),
        ("time.sleep", lambda _: None),
        ("tools.skills_sync.sync_skills", lambda quiet=True: None),
        ("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path),
        ("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None),
        ("gateway.run.GatewayRunner", _CleanExitRunner),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    from gateway.run import start_gateway

    started = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)

    assert started is True
    assert terminate_calls == [(42, False), (42, True)]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
    monkeypatch, tmp_path
):
    """--replace must write a takeover marker BEFORE sending SIGTERM.

    The marker lets the target's shutdown handler identify the signal as a
    planned takeover (→ exit 0) rather than an unexpected kill (→ exit 1).
    Without the marker, PR #5646's signal-recovery path would revive the
    target via systemd Restart=on-failure, starting a flap loop.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    marker_file = tmp_path / ".gateway-takeover.json"

    # Record the ORDER of marker-write + terminate_pid calls
    events: list[str] = []
    # One entry per write_marker call: True when no marker existed yet at
    # the moment the replacer asked for one to be written.
    marker_absent_before_write: list[bool] = []

    def record_write_marker(target_pid: int) -> bool:
        events.append(f"write_marker(target_pid={target_pid})")
        # Snapshot BEFORE writing: the marker must not exist yet.
        marker_absent_before_write.append(not marker_file.exists())
        # Actually write the marker so we can verify cleanup later
        from gateway.status import _get_takeover_marker_path, _write_json_file

        _write_json_file(_get_takeover_marker_path(), {
            "target_pid": target_pid,
            "target_start_time": 0,
            "replacer_pid": 100,
            "written_at": "2026-04-17T00:00:00+00:00",
        })
        return True

    def record_terminate(pid, force=False):
        events.append(f"terminate_pid(pid={pid}, force={force})")

    class _CleanExitRunner:
        """Runner double: starts successfully, stops trivially, reports a clean exit."""

        def __init__(self, config):
            self.config = config
            self.should_exit_cleanly = True
            self.exit_reason = None
            self.adapters = {}

        async def start(self):
            return True

        async def stop(self):
            return None

    # get_running_pid reports PID 42 until remove_pid_file() is called.
    _pid_state = {"alive": True}

    def _mock_get_running_pid():
        return 42 if _pid_state["alive"] else None

    def _mock_remove_pid_file():
        _pid_state["alive"] = False

    def _raise_process_gone(pid, sig):
        raise ProcessLookupError()

    monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
    monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
    monkeypatch.setattr(
        "gateway.status.release_all_scoped_locks",
        lambda **kwargs: 0,
    )
    monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker)
    monkeypatch.setattr("gateway.status.terminate_pid", record_terminate)
    monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
    # Simulate old process exiting on first check so we don't loop into force-kill
    monkeypatch.setattr("gateway.run.os.kill", _raise_process_gone)
    monkeypatch.setattr("time.sleep", lambda _: None)
    monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
    monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
    monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
    monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)

    from gateway.run import start_gateway

    ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)

    assert ok is True
    # Ordering: marker written BEFORE SIGTERM
    assert events[0] == "write_marker(target_pid=42)"
    assert any(e.startswith("terminate_pid(pid=42") for e in events[1:])
    # The first marker write started from a clean slate (no stale marker).
    assert marker_absent_before_write[0] is True
    # Marker file cleanup: replacer cleans it after loop completes
    assert not marker_file.exists()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_start_gateway_replace_clears_marker_on_permission_denied(
    monkeypatch, tmp_path
):
    """A failed kill (permission denied) must not leave the takeover marker behind.

    A leftover marker could otherwise grief an unrelated future shutdown.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    def write_marker(target_pid: int) -> bool:
        from gateway.status import _get_takeover_marker_path, _write_json_file

        marker_payload = {
            "target_pid": target_pid,
            "target_start_time": 0,
            "replacer_pid": 100,
            "written_at": "2026-04-17T00:00:00+00:00",
        }
        _write_json_file(_get_takeover_marker_path(), marker_payload)
        return True

    def raise_permission(pid, force=False):
        raise PermissionError("simulated EPERM")

    patches = (
        ("gateway.status.get_running_pid", lambda: 42),
        ("gateway.status.write_takeover_marker", write_marker),
        ("gateway.status.terminate_pid", raise_permission),
        ("gateway.run.os.getpid", lambda: 100),
        ("tools.skills_sync.sync_skills", lambda quiet=True: None),
        ("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path),
        ("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    from gateway.run import start_gateway

    # Should return False due to permission error
    started = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)

    assert started is False
    # Marker must NOT be left behind
    leftover_marker = tmp_path / ".gateway-takeover.json"
    assert not leftover_marker.exists()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_runner_degrades_gracefully_when_all_adapters_missing(monkeypatch, tmp_path, caplog):
    """With no adapter for any enabled platform, the gateway warns and keeps running.

    When every enabled platform lacks an adapter (missing library or
    credentials), ``start()`` should NOT report failure; it should continue
    running for cron job execution, matching the behaviour of 'no platforms
    enabled' (#5196).

    In fleet deployments the same config.yaml is shared across nodes that may
    only have credentials for a subset of platforms. Requiring perfect
    credentials on every node makes fleet operation impossible.
    """
    import logging

    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    enabled_platforms = {
        Platform.TELEGRAM: PlatformConfig(enabled=True, token="***"),
        Platform.DISCORD: PlatformConfig(enabled=True, token="***"),
    }
    runner = GatewayRunner(
        GatewayConfig(
            platforms=enabled_platforms,
            sessions_dir=tmp_path / "sessions",
        )
    )

    # Simulate _create_adapter returning None for ALL platforms (missing library /
    # missing credentials — no connection attempt ever made).
    def _no_adapter(platform, cfg):
        return None

    monkeypatch.setattr(runner, "_create_adapter", _no_adapter)

    with caplog.at_level(logging.WARNING):
        started = await runner.start()

    # Must NOT return False — gateway should keep running for cron.
    assert started is True
    assert runner.should_exit_cleanly is False
    assert runner.adapters == {}

    # Runtime state must remain "running", not "startup_failed".
    status = read_runtime_status()
    assert status["gateway_state"] == "running"

    # A warning must be emitted explaining why no platforms connected.
    assert any(
        "No adapter could be created" in record.message
        for record in caplog.records
    ), "Expected degraded-mode warning when all adapters are missing"
|
|
|
|
|
|
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
    """Building a runner in docker mode with only a localtime volume logs a mount warning."""
    environment = {
        "HERMES_HOME": str(tmp_path),
        "TERMINAL_ENV": "docker",
        "TERMINAL_DOCKER_VOLUMES": '["/etc/localtime:/etc/localtime:ro"]',
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    telegram_config = PlatformConfig(enabled=True, token="***")
    config = GatewayConfig(
        platforms={Platform.TELEGRAM: telegram_config},
        sessions_dir=tmp_path / "sessions",
    )

    # Only construction is exercised; the runner is never started.
    with caplog.at_level("WARNING"):
        GatewayRunner(config)

    logged_messages = [record.message for record in caplog.records]
    assert any("host-visible output mount" in message for message in logged_messages)
|