From 40d7c264f0471d8b7fce39ef12d49cb02eb18b0e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 14 Jun 2026 18:43:23 -0700 Subject: [PATCH] fix(s6): register profile gateways without auto-starting (#46266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(s6): prevent profile create from auto-starting gateway service When hermes profile create runs inside an s6 container, _maybe_register_gateway_service() calls register_profile_gateway() which creates the service directory and triggers s6-svscanctl -a. Previously the service always started immediately, causing profiles that share the main gateway's bot token (e.g. Kanban worker profiles) to fail with a token-lock conflict and persist gateway_state: running — becoming zombies that resurrect on every container restart. Wire the existing start_now parameter through the S6 implementation: when start_now=False, write a marker file (same pattern as container_boot.py _register_gateway_slot) so s6-supervise leaves the service stopped until the user explicitly runs hermes -p gateway start. 4 files, +61/-6, 4 new tests (all passing). * test(docker): wait for gateway running state before restart --------- Co-authored-by: liuhao1024 --- hermes_cli/profiles.py | 2 +- hermes_cli/service_manager.py | 19 ++++++++++---- tests/docker/test_container_restart.py | 3 ++- tests/hermes_cli/test_profiles_s6_hooks.py | 17 +++++++++++++ tests/hermes_cli/test_service_manager.py | 29 ++++++++++++++++++++++ 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index ea1212de621..881dd481445 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -1268,7 +1268,7 @@ def _maybe_register_gateway_service(profile_name: str) -> None: if not mgr.supports_runtime_registration(): return # host backend; no-op try: - mgr.register_profile_gateway(profile_name) + mgr.register_profile_gateway(profile_name, start_now=False) except ValueError: # Already registered (e.g. the container-boot reconciler ran # first and brought up a stale slot). That's fine. diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 79c833ab6a0..129e66e506f 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -77,6 +77,7 @@ class ServiceManager(Protocol): profile: str, *, extra_env: dict[str, str] | None = None, + start_now: bool = True, ) -> None: ... def unregister_profile_gateway(self, profile: str) -> None: ... def list_profile_gateways(self) -> list[str]: ... @@ -182,6 +183,7 @@ class _RegistrationUnsupportedMixin: profile: str, *, extra_env: dict[str, str] | None = None, + start_now: bool = True, ) -> None: raise NotImplementedError( f"{type(self).__name__} does not support runtime profile " @@ -830,15 +832,15 @@ class S6ServiceManager: profile: str, *, extra_env: dict[str, str] | None = None, + start_now: bool = True, ) -> None: """Create the s6 service directory for a profile gateway. Triggers ``s6-svscanctl -a`` so s6-svscan picks the new directory - up immediately. The service is created in the *up* state — to - register without auto-starting, follow up with ``stop(profile)`` - (or pass the start flag via the future ``start_now=False`` arg, - which the Phase 4 reconciliation path uses via a ``down`` - marker file written directly). + up immediately. When *start_now* is ``True`` (the default) the + service starts immediately; when ``False`` a ``down`` marker file + is written so s6-supervise leaves the service stopped until the + user explicitly runs ``hermes -p gateway start``. Raises: ValueError: if the profile name is invalid or the service @@ -886,6 +888,13 @@ class S6ServiceManager: # rationale. _seed_supervise_skeleton(tmp_dir) + # When start_now is False, write a `down` marker so + # s6-supervise does not auto-start the service on rescan. + # Mirrors the same pattern in container_boot.py + # _register_gateway_slot when start=False. + if not start_now: + (tmp_dir / "down").touch() + tmp_dir.rename(svc_dir) except Exception: shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/docker/test_container_restart.py b/tests/docker/test_container_restart.py index 2ad00ef294a..ca08c127f8b 100644 --- a/tests/docker/test_container_restart.py +++ b/tests/docker/test_container_restart.py @@ -296,7 +296,8 @@ def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp( ) if r.returncode == 0 and '"gateway_state"' in r.stdout: state = r.stdout - break + if '"running"' in state: + break time.sleep(0.5) assert '"running"' in state, ( f"gateway never persisted running state pre-restart: {state!r}" diff --git a/tests/hermes_cli/test_profiles_s6_hooks.py b/tests/hermes_cli/test_profiles_s6_hooks.py index db50debdcba..c5d3ccbf6a9 100644 --- a/tests/hermes_cli/test_profiles_s6_hooks.py +++ b/tests/hermes_cli/test_profiles_s6_hooks.py @@ -54,10 +54,12 @@ class _S6Manager: def register_profile_gateway( self, profile: str, *, extra_env: dict[str, str] | None = None, + start_now: bool = True, ) -> None: if self.raise_on_register is not None: raise self.raise_on_register self.registered.append(profile) + self.last_start_now = start_now def unregister_profile_gateway(self, profile: str) -> None: if self.raise_on_unregister is not None: @@ -107,6 +109,21 @@ def test_register_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None: assert mgr.registered == ["coder"] +def test_register_passes_start_now_false(monkeypatch: pytest.MonkeyPatch) -> None: + """_maybe_register_gateway_service must register with start_now=False + so that profile creation does not auto-start a gateway that may + conflict with the main gateway's bot-token lock.""" + _patch_detect_s6(monkeypatch) + mgr = _S6Manager() + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_register_gateway_service("coder") + assert mgr.last_start_now is False, ( + "profile creation must not auto-start the gateway service" + ) + + def test_register_swallows_duplicate_value_error( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index 3bbb0a66227..bf157ef25b6 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -586,6 +586,35 @@ def test_s6_register_creates_service_dir_and_triggers_scan( ), f"s6-svscanctl -a not invoked; saw: {fake_subprocess_run}" +def test_s6_register_start_now_false_writes_down_marker( + s6_scandir, fake_subprocess_run, +) -> None: + """When start_now=False, a `down` marker must be written so + s6-supervise does not auto-start the service on rescan.""" + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.register_profile_gateway("coder", start_now=False) + + svc_dir = s6_scandir / "gateway-coder" + assert svc_dir.is_dir() + assert (svc_dir / "down").is_file(), ( + "start_now=False must write a `down` marker file" + ) + + +def test_s6_register_start_now_true_no_down_marker( + s6_scandir, fake_subprocess_run, +) -> None: + """When start_now=True (default), no `down` marker should exist.""" + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.register_profile_gateway("coder") + + svc_dir = s6_scandir / "gateway-coder" + assert svc_dir.is_dir() + assert not (svc_dir / "down").exists(), ( + "start_now=True must NOT write a `down` marker file" + ) + + def test_s6_register_extra_env_is_quoted(s6_scandir, fake_subprocess_run) -> None: mgr = S6ServiceManager(scandir=s6_scandir) mgr.register_profile_gateway(