fix(gateway): default restart_drain_timeout to 0 to kill systemd crash loop (#54066)

A restart now interrupts in-flight agents immediately rather than holding the gateway open for a grace window. The previous 180s default coupled two independently-set timers: the gateway's own drain timer and systemd's TimeoutStopSec. On a stale unit where TimeoutStopSec < drain, systemd SIGKILLed the gateway mid-cleanup, leaving a stale lock that made the next startup exit immediately ('already running') — an infinite crash loop under Restart=on-failure (#31981). Setting drain to 0 makes the mismatch structurally impossible: with drain 0 the generated unit gets TimeoutStopSec=90 against a near-instant drain, so systemd never kills mid-cleanup. Contract: restart the gateway, in-flight work stops. A grace window large enough to 'save' a long agent turn would have to outlast an unbounded task, which is impossible. Also fixes the stale-unit warning's suggested command (hermes gateway service install --replace -> hermes gateway install --force); the former subcommand does not exist. Closes #31981
2026-06-30 11:52:04 +00:00 · 2026-06-28 01:14:34 -07:00 · 2026-06-28 01:14:34 -07:00 · c9df4bc094
commit c9df4bc094
parent 0800f1c28b
4 changed files with 20 additions and 11 deletions
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@ -626,10 +626,10 @@ agent:
  # gateway_timeout_warning: 900

  # Graceful drain timeout for gateway stop/restart (seconds).
-  # The gateway stops accepting new work, waits for in-flight agents to
-  # finish, then interrupts anything still running after this timeout.
-  # 0 = no drain, interrupt immediately.
-  # restart_drain_timeout: 60
+  # Default 0 = no drain: a restart interrupts in-flight agents immediately,
+  # cleans up, and exits. Set a positive value only if you want a grace
+  # window on /restart, and keep it well under systemd's TimeoutStopSec.
+  # restart_drain_timeout: 0

  # Max app-level retry attempts for API errors (connection drops, provider
  # timeouts, 5xx, etc.) before the agent surfaces the failure. Lower this
--- a/gateway/run.py
+++ b/gateway/run.py
@ -5811,7 +5811,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                logger.warning(
                    "Stale systemd unit detected: %s has TimeoutStopSec=%.0fs but "
                    "drain_timeout=%.0fs (expected >=%.0fs). systemd may SIGKILL the "
-                    "gateway mid-drain. Run `hermes gateway service install --replace` "
+                    "gateway mid-drain. Run `hermes gateway install --force` "
                    "to regenerate the unit, or shorten agent.restart_drain_timeout.",
                    _alignment.get("unit", "(unknown)"),
                    _alignment["timeout_stop_sec"],
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -916,13 +916,18 @@ DEFAULT_CONFIG = {
        # Graceful drain timeout for gateway stop/restart (seconds).
        # The gateway stops accepting new work, waits for running agents
        # to finish, then interrupts any remaining runs after the timeout.
-        # 0 = no drain, interrupt immediately.
+        # 0 = no drain, interrupt immediately (the default).
        #
-        # 180s is calibrated for realistic in-flight agent turns: a typical
-        # coding conversation mid-reasoning runs 60–150s per call, so a 60s
-        # budget routinely interrupted legitimate work on /restart. Raise
-        # further in config.yaml if you run very-long-reasoning models.
-        "restart_drain_timeout": 180,
+        # Contract: if you restart the gateway, in-flight work stops. We do
+        # not hold the restart open for a grace window — a drain timeout
+        # large enough to "save" a long agent turn would have to outlast an
+        # unbounded task (some runs take days), which is impossible, and a
+        # drain timeout longer than systemd's TimeoutStopSec invites a
+        # SIGKILL-mid-cleanup race that leaves a stale lock and crash-loops
+        # the service. 0 sidesteps both: interrupt now, clean up, exit fast.
+        # Set a positive value in config.yaml only if you explicitly want a
+        # grace window on /restart (and keep it well under TimeoutStopSec).
+        "restart_drain_timeout": 0,
        # Max app-level retry attempts for API errors (connection drops,
        # provider timeouts, 5xx, etc.) before the agent surfaces the
        # failure.  The OpenAI SDK already does its own low-level retries
--- a/tests/gateway/test_gateway_shutdown.py
+++ b/tests/gateway/test_gateway_shutdown.py
@ -94,6 +94,10 @@ async def test_gateway_stop_interrupts_running_agents_and_cancels_adapter_tasks(
@pytest.mark.asyncio
 async def test_gateway_stop_drains_running_agents_before_disconnect():
    runner, adapter = make_restart_runner()
+    # Opt into a grace window (the default is 0 = interrupt immediately).
+    # This exercises the path where an agent finishes within the drain
+    # window and must NOT be interrupted.
+    runner._restart_drain_timeout = 5.0
    disconnect_mock = AsyncMock()
    adapter.disconnect = disconnect_mock