fix(gateway): exit with failure when all platforms fail with retryable errors (#3592)

When all messaging platforms exhaust their retries and are queued for background
reconnection, exit with code 1 so that systemd's Restart=on-failure can restart
the process. Previously the gateway stayed alive as a zombie with no
connected platforms, so it never reported a failing exit code.

Salvaged from PR #3567 by kelsia14. Test updates added.

Co-authored-by: kelsia14 <kelsia14@users.noreply.github.com>
This commit is contained in:
Teknium 2026-03-28 14:25:12 -07:00 committed by GitHub
parent d7c41f3cef
commit 708f187549
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 48 additions and 9 deletions

View file

@ -745,10 +745,22 @@ class GatewayRunner:
logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
await self.stop()
elif not self.adapters and self._failed_platforms:
logger.warning(
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
len(self._failed_platforms),
)
# All platforms are down and queued for background reconnection.
# If the error is retryable, exit with failure so systemd Restart=on-failure
# can restart the process. Otherwise stay alive and keep retrying in background.
if adapter.fatal_error_retryable:
self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
self._exit_with_failure = True
logger.error(
"All messaging platforms failed with retryable errors. "
"Shutting down gateway for service restart (systemd will retry)."
)
await self.stop()
else:
logger.warning(
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
len(self._failed_platforms),
)
def _request_clean_exit(self, reason: str) -> None:
self._exit_cleanly = True

View file

@ -344,6 +344,7 @@ class TestRuntimeDisconnectQueuing:
async def test_retryable_runtime_error_queued_for_reconnect(self):
"""Retryable runtime errors should add the platform to _failed_platforms."""
runner = _make_runner()
runner.stop = AsyncMock()
adapter = StubAdapter(succeed=True)
adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
@ -371,8 +372,12 @@ class TestRuntimeDisconnectQueuing:
assert Platform.TELEGRAM not in runner._failed_platforms
@pytest.mark.asyncio
async def test_retryable_error_prevents_shutdown_when_queued(self):
"""Gateway should not shut down if failed platforms are queued for reconnection."""
async def test_retryable_error_exits_for_service_restart_when_all_down(self):
"""Gateway should exit with failure when all platforms fail with retryable errors.
This lets systemd Restart=on-failure restart the process, which is more
reliable than in-process background reconnection after exhausted retries.
"""
runner = _make_runner()
runner.stop = AsyncMock()
@ -382,7 +387,28 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
# stop() should NOT have been called since we have platforms queued
# stop() SHOULD be called — gateway exits for systemd restart
runner.stop.assert_called_once()
assert runner._exit_with_failure is True
assert Platform.TELEGRAM in runner._failed_platforms
@pytest.mark.asyncio
async def test_retryable_error_no_exit_when_other_adapters_still_connected(self):
"""Gateway should NOT exit if some adapters are still connected."""
runner = _make_runner()
runner.stop = AsyncMock()
failing_adapter = StubAdapter(succeed=True)
failing_adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
runner.adapters[Platform.TELEGRAM] = failing_adapter
# Another adapter is still connected
healthy_adapter = StubAdapter(succeed=True)
runner.adapters[Platform.DISCORD] = healthy_adapter
await runner._handle_adapter_fatal_error(failing_adapter)
# stop() should NOT have been called — Discord is still up
runner.stop.assert_not_called()
assert Platform.TELEGRAM in runner._failed_platforms

View file

@ -89,7 +89,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc
await runner._handle_adapter_fatal_error(adapter)
# Should NOT shut down — platform is queued for reconnection
runner.stop.assert_not_awaited()
# Should shut down with failure — systemd Restart=on-failure will restart
runner.stop.assert_awaited_once()
assert runner._exit_with_failure is True
assert Platform.WHATSAPP in runner._failed_platforms
assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0