fix(gateway): degrade gracefully when all platform adapters are missing

When connected_count == 0 AND enabled_platform_count > 0, the gateway
treated 'all adapters returned None' identically to 'all adapters
failed to connect' — both as fatal startup errors. The 'returned None'
case happens when imports fail silently or when adapters are present
in config but their dependencies aren't installed (e.g. discord.py
missing). Cron jobs and other gateway-runtime work would unnecessarily
fail to start.

Split: only return False when startup_retryable_errors is non-empty
(a real connection attempt failed). When the list is empty AND
enabled_platform_count > 0, log a warning and continue running,
matching the 'no platforms enabled' cron path.

Salvage of #22642's gateway slice. Drops the bundled run_agent.py
memory-nudge counter hydration block (issue #22357 territory) which
wasn't mentioned in the PR description.

Closes #5196.
This commit is contained in:
Wesley Simplicio 2026-05-09 14:56:03 -07:00 committed by Teknium
parent 116a1446a4
commit 246c676c2b
2 changed files with 65 additions and 10 deletions

View file

@ -3533,16 +3533,30 @@ class GatewayRunner:
self._request_clean_exit(reason)
return True
if enabled_platform_count > 0:
reason = "; ".join(startup_retryable_errors) or "all configured messaging platforms failed to connect"
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
except Exception:
pass
return False
logger.warning("No messaging platforms enabled.")
logger.info("Gateway will continue running for cron job execution.")
if startup_retryable_errors:
# At least one platform attempted a connection and failed —
# this is a real startup error that should block the gateway.
reason = "; ".join(startup_retryable_errors)
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
except Exception:
pass
return False
# All enabled platforms had no adapter (missing library or credentials).
# In fleet deployments the same config.yaml is shared across nodes that
# may only have credentials for a subset of platforms. Rather than
# failing hard, degrade gracefully and allow cron jobs to run (#5196).
logger.warning(
"No adapter could be created for any of the %d configured platform(s). "
"Check that required dependencies are installed and credentials are set. "
"Gateway will continue for cron job execution.",
enabled_platform_count,
)
else:
logger.warning("No messaging platforms enabled.")
logger.info("Gateway will continue running for cron job execution.")
# Update delivery router with adapters
self.delivery_router.adapters = self.adapters

View file

@ -339,6 +339,47 @@ async def test_start_gateway_replace_clears_marker_on_permission_denied(
assert not (tmp_path / ".gateway-takeover.json").exists()
@pytest.mark.asyncio
async def test_runner_degrades_gracefully_when_all_adapters_missing(monkeypatch, tmp_path, caplog):
    """Gateway keeps running when no adapter can be built for any enabled platform.

    When every enabled platform yields no adapter (missing library or
    credentials), the gateway should NOT report a startup failure; it should
    warn and continue running for cron job execution, matching the behaviour
    of 'no platforms enabled' (#5196).

    In fleet deployments the same config.yaml is shared across nodes that may
    only have credentials for a subset of platforms. Requiring perfect
    credentials on every node makes fleet operation impossible.
    """
    import logging

    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # Two enabled platforms, each with its own PlatformConfig instance.
    enabled_platforms = {
        platform: PlatformConfig(enabled=True, token="***")
        for platform in (Platform.TELEGRAM, Platform.DISCORD)
    }
    runner = GatewayRunner(
        GatewayConfig(
            platforms=enabled_platforms,
            sessions_dir=tmp_path / "sessions",
        )
    )

    def _no_adapter(platform, cfg):
        # Stand-in for "missing library / missing credentials": no adapter is
        # produced, so no connection attempt is ever made.
        return None

    monkeypatch.setattr(runner, "_create_adapter", _no_adapter)

    with caplog.at_level(logging.WARNING):
        started = await runner.start()

    # start() must report success so the gateway stays up for cron.
    assert started is True
    assert runner.should_exit_cleanly is False
    assert runner.adapters == {}

    # Runtime state must remain "running", not "startup_failed".
    assert read_runtime_status()["gateway_state"] == "running"

    # The degraded-mode warning must have been logged.
    degraded_warnings = [
        rec for rec in caplog.records
        if "No adapter could be created" in rec.message
    ]
    assert degraded_warnings, "Expected degraded-mode warning when all adapters are missing"
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("TERMINAL_ENV", "docker")