mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): exit with failure when all platforms fail with retryable errors (#3592)
When all messaging platforms exhaust retries and get queued for background reconnection, exit with code 1 so systemd Restart=on-failure can restart the process. Previously the gateway stayed alive as a zombie with no connected platforms and exit code 0. Salvaged from PR #3567 by kelsia14. Test updates added. Co-authored-by: kelsia14 <kelsia14@users.noreply.github.com>
This commit is contained in:
parent
d7c41f3cef
commit
708f187549
3 changed files with 48 additions and 9 deletions
|
|
@ -745,10 +745,22 @@ class GatewayRunner:
|
|||
logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
|
||||
await self.stop()
|
||||
elif not self.adapters and self._failed_platforms:
|
||||
logger.warning(
|
||||
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
|
||||
len(self._failed_platforms),
|
||||
)
|
||||
# All platforms are down and queued for background reconnection.
|
||||
# If the error is retryable, exit with failure so systemd Restart=on-failure
|
||||
# can restart the process. Otherwise stay alive and keep retrying in background.
|
||||
if adapter.fatal_error_retryable:
|
||||
self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
|
||||
self._exit_with_failure = True
|
||||
logger.error(
|
||||
"All messaging platforms failed with retryable errors. "
|
||||
"Shutting down gateway for service restart (systemd will retry)."
|
||||
)
|
||||
await self.stop()
|
||||
else:
|
||||
logger.warning(
|
||||
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
|
||||
len(self._failed_platforms),
|
||||
)
|
||||
|
||||
def _request_clean_exit(self, reason: str) -> None:
|
||||
self._exit_cleanly = True
|
||||
|
|
|
|||
|
|
@ -344,6 +344,7 @@ class TestRuntimeDisconnectQueuing:
|
|||
async def test_retryable_runtime_error_queued_for_reconnect(self):
|
||||
"""Retryable runtime errors should add the platform to _failed_platforms."""
|
||||
runner = _make_runner()
|
||||
runner.stop = AsyncMock()
|
||||
|
||||
adapter = StubAdapter(succeed=True)
|
||||
adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
||||
|
|
@ -371,8 +372,12 @@ class TestRuntimeDisconnectQueuing:
|
|||
assert Platform.TELEGRAM not in runner._failed_platforms
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_retryable_error_prevents_shutdown_when_queued(self):
|
||||
"""Gateway should not shut down if failed platforms are queued for reconnection."""
|
||||
async def test_retryable_error_exits_for_service_restart_when_all_down(self):
|
||||
"""Gateway should exit with failure when all platforms fail with retryable errors.
|
||||
|
||||
This lets systemd Restart=on-failure restart the process, which is more
|
||||
reliable than in-process background reconnection after exhausted retries.
|
||||
"""
|
||||
runner = _make_runner()
|
||||
runner.stop = AsyncMock()
|
||||
|
||||
|
|
@ -382,7 +387,28 @@ class TestRuntimeDisconnectQueuing:
|
|||
|
||||
await runner._handle_adapter_fatal_error(adapter)
|
||||
|
||||
# stop() should NOT have been called since we have platforms queued
|
||||
# stop() SHOULD be called — gateway exits for systemd restart
|
||||
runner.stop.assert_called_once()
|
||||
assert runner._exit_with_failure is True
|
||||
assert Platform.TELEGRAM in runner._failed_platforms
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_retryable_error_no_exit_when_other_adapters_still_connected(self):
|
||||
"""Gateway should NOT exit if some adapters are still connected."""
|
||||
runner = _make_runner()
|
||||
runner.stop = AsyncMock()
|
||||
|
||||
failing_adapter = StubAdapter(succeed=True)
|
||||
failing_adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
||||
runner.adapters[Platform.TELEGRAM] = failing_adapter
|
||||
|
||||
# Another adapter is still connected
|
||||
healthy_adapter = StubAdapter(succeed=True)
|
||||
runner.adapters[Platform.DISCORD] = healthy_adapter
|
||||
|
||||
await runner._handle_adapter_fatal_error(failing_adapter)
|
||||
|
||||
# stop() should NOT have been called — Discord is still up
|
||||
runner.stop.assert_not_called()
|
||||
assert Platform.TELEGRAM in runner._failed_platforms
|
||||
|
||||
|
|
|
|||
|
|
@ -89,7 +89,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc
|
|||
|
||||
await runner._handle_adapter_fatal_error(adapter)
|
||||
|
||||
# Should NOT shut down — platform is queued for reconnection
|
||||
runner.stop.assert_not_awaited()
|
||||
# Should shut down with failure — systemd Restart=on-failure will restart
|
||||
runner.stop.assert_awaited_once()
|
||||
assert runner._exit_with_failure is True
|
||||
assert Platform.WHATSAPP in runner._failed_platforms
|
||||
assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue