From c9df4bc094fb27ddfc8b278380d9598dc534587b Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 28 Jun 2026 01:14:34 -0700 Subject: [PATCH] fix(gateway): default restart_drain_timeout to 0 to kill systemd crash loop (#54066) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A restart now interrupts in-flight agents immediately rather than holding the gateway open for a grace window. The previous 180s default coupled two independently-set timers: the gateway's own drain timer and systemd's TimeoutStopSec. On a stale unit where TimeoutStopSec < drain, systemd SIGKILLed the gateway mid-cleanup, leaving a stale lock that made the next startup exit immediately ('already running') — an infinite crash loop under Restart=on-failure (#31981). Setting drain to 0 makes the mismatch structurally impossible: with drain 0 the generated unit gets TimeoutStopSec=90 against a near-instant drain, so systemd never kills mid-cleanup. Contract: restart the gateway, in-flight work stops. A grace window large enough to 'save' a long agent turn would have to outlast an unbounded task, which is impossible. Also fixes the stale-unit warning's suggested command (hermes gateway service install --replace -> hermes gateway install --force); the former subcommand does not exist. Closes #31981 --- cli-config.yaml.example | 8 ++++---- gateway/run.py | 2 +- hermes_cli/config.py | 17 +++++++++++------ tests/gateway/test_gateway_shutdown.py | 4 ++++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 961e783522b..81644d8907b 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -626,10 +626,10 @@ agent: # gateway_timeout_warning: 900 # Graceful drain timeout for gateway stop/restart (seconds). - # The gateway stops accepting new work, waits for in-flight agents to - # finish, then interrupts anything still running after this timeout. 
- # 0 = no drain, interrupt immediately. - # restart_drain_timeout: 60 + # Default 0 = no drain: a restart interrupts in-flight agents immediately, + # cleans up, and exits. Set a positive value only if you want a grace + # window on /restart, and keep it well under systemd's TimeoutStopSec. + # restart_drain_timeout: 0 # Max app-level retry attempts for API errors (connection drops, provider # timeouts, 5xx, etc.) before the agent surfaces the failure. Lower this diff --git a/gateway/run.py b/gateway/run.py index f8e8232f36c..1712a43c501 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -5811,7 +5811,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew logger.warning( "Stale systemd unit detected: %s has TimeoutStopSec=%.0fs but " "drain_timeout=%.0fs (expected >=%.0fs). systemd may SIGKILL the " - "gateway mid-drain. Run `hermes gateway service install --replace` " + "gateway mid-drain. Run `hermes gateway install --force` " "to regenerate the unit, or shorten agent.restart_drain_timeout.", _alignment.get("unit", "(unknown)"), _alignment["timeout_stop_sec"], diff --git a/hermes_cli/config.py b/hermes_cli/config.py index c77b8ab3ee3..705d83a4512 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -916,13 +916,18 @@ DEFAULT_CONFIG = { # Graceful drain timeout for gateway stop/restart (seconds). # The gateway stops accepting new work, waits for running agents # to finish, then interrupts any remaining runs after the timeout. - # 0 = no drain, interrupt immediately. + # 0 = no drain, interrupt immediately (the default). # - # 180s is calibrated for realistic in-flight agent turns: a typical - # coding conversation mid-reasoning runs 60–150s per call, so a 60s - # budget routinely interrupted legitimate work on /restart. Raise - # further in config.yaml if you run very-long-reasoning models. - "restart_drain_timeout": 180, + # Contract: if you restart the gateway, in-flight work stops. 
We do + # not hold the restart open for a grace window — a drain timeout + # large enough to "save" a long agent turn would have to outlast an + # unbounded task (some runs take days), which is impossible, and a + # drain timeout longer than systemd's TimeoutStopSec invites a + # SIGKILL-mid-cleanup race that leaves a stale lock and crash-loops + # the service. 0 sidesteps both: interrupt now, clean up, exit fast. + # Set a positive value in config.yaml only if you explicitly want a + # grace window on /restart (and keep it well under TimeoutStopSec). + "restart_drain_timeout": 0, # Max app-level retry attempts for API errors (connection drops, # provider timeouts, 5xx, etc.) before the agent surfaces the # failure. The OpenAI SDK already does its own low-level retries diff --git a/tests/gateway/test_gateway_shutdown.py b/tests/gateway/test_gateway_shutdown.py index 25f9c123557..0df0cb90411 100644 --- a/tests/gateway/test_gateway_shutdown.py +++ b/tests/gateway/test_gateway_shutdown.py @@ -94,6 +94,10 @@ async def test_gateway_stop_interrupts_running_agents_and_cancels_adapter_tasks( @pytest.mark.asyncio async def test_gateway_stop_drains_running_agents_before_disconnect(): runner, adapter = make_restart_runner() + # Opt into a grace window (the default is 0 = interrupt immediately). + # This exercises the path where an agent finishes within the drain + # window and must NOT be interrupted. + runner._restart_drain_timeout = 5.0 disconnect_mock = AsyncMock() adapter.disconnect = disconnect_mock