fix(s6): register profile gateways without auto-starting (#46266)

* fix(s6): prevent profile create from auto-starting gateway service

When hermes profile create runs inside an s6 container,
_maybe_register_gateway_service() calls register_profile_gateway()
which creates the service directory and triggers s6-svscanctl -a.
Previously the service always started immediately, causing profiles
that share the main gateway's bot token (e.g. Kanban worker profiles)
to fail with a token-lock conflict and persist gateway_state: running
— becoming zombies that resurrect on every container restart.

Wire the existing start_now parameter through the S6 implementation:
when start_now=False, write a  marker file (same pattern as
container_boot.py _register_gateway_slot) so s6-supervise leaves the
service stopped until the user explicitly runs hermes -p <profile>
gateway start.

4 files, +61/-6, 4 new tests (all passing).

* test(docker): wait for gateway running state before restart

---------

Co-authored-by: liuhao1024 <sunsky.lau@gmail.com>
This commit is contained in:
Teknium 2026-06-14 18:43:23 -07:00 committed by GitHub
parent 4eb0ff639b
commit 40d7c264f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 63 additions and 7 deletions

View file

@ -1268,7 +1268,7 @@ def _maybe_register_gateway_service(profile_name: str) -> None:
if not mgr.supports_runtime_registration():
return # host backend; no-op
try:
mgr.register_profile_gateway(profile_name)
mgr.register_profile_gateway(profile_name, start_now=False)
except ValueError:
# Already registered (e.g. the container-boot reconciler ran
# first and brought up a stale slot). That's fine.

View file

@ -77,6 +77,7 @@ class ServiceManager(Protocol):
profile: str,
*,
extra_env: dict[str, str] | None = None,
start_now: bool = True,
) -> None: ...
def unregister_profile_gateway(self, profile: str) -> None: ...
def list_profile_gateways(self) -> list[str]: ...
@ -182,6 +183,7 @@ class _RegistrationUnsupportedMixin:
profile: str,
*,
extra_env: dict[str, str] | None = None,
start_now: bool = True,
) -> None:
raise NotImplementedError(
f"{type(self).__name__} does not support runtime profile "
@ -830,15 +832,15 @@ class S6ServiceManager:
profile: str,
*,
extra_env: dict[str, str] | None = None,
start_now: bool = True,
) -> None:
"""Create the s6 service directory for a profile gateway.
Triggers ``s6-svscanctl -a`` so s6-svscan picks the new directory
up immediately. The service is created in the *up* state to
register without auto-starting, follow up with ``stop(profile)``
(or pass the start flag via the future ``start_now=False`` arg,
which the Phase 4 reconciliation path uses via a ``down``
marker file written directly).
up immediately. When *start_now* is ``True`` (the default) the
service starts immediately; when ``False`` a ``down`` marker file
is written so s6-supervise leaves the service stopped until the
user explicitly runs ``hermes -p <profile> gateway start``.
Raises:
ValueError: if the profile name is invalid or the service
@ -886,6 +888,13 @@ class S6ServiceManager:
# rationale.
_seed_supervise_skeleton(tmp_dir)
# When start_now is False, write a `down` marker so
# s6-supervise does not auto-start the service on rescan.
# Mirrors the same pattern in container_boot.py
# _register_gateway_slot when start=False.
if not start_now:
(tmp_dir / "down").touch()
tmp_dir.rename(svc_dir)
except Exception:
shutil.rmtree(tmp_dir, ignore_errors=True)

View file

@ -296,7 +296,8 @@ def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp(
)
if r.returncode == 0 and '"gateway_state"' in r.stdout:
state = r.stdout
break
if '"running"' in state:
break
time.sleep(0.5)
assert '"running"' in state, (
f"gateway never persisted running state pre-restart: {state!r}"

View file

@ -54,10 +54,12 @@ class _S6Manager:
def register_profile_gateway(
self, profile: str, *,
extra_env: dict[str, str] | None = None,
start_now: bool = True,
) -> None:
if self.raise_on_register is not None:
raise self.raise_on_register
self.registered.append(profile)
self.last_start_now = start_now
def unregister_profile_gateway(self, profile: str) -> None:
if self.raise_on_unregister is not None:
@ -107,6 +109,21 @@ def test_register_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None:
assert mgr.registered == ["coder"]
def test_register_passes_start_now_false(monkeypatch: pytest.MonkeyPatch) -> None:
"""_maybe_register_gateway_service must register with start_now=False
so that profile creation does not auto-start a gateway that may
conflict with the main gateway's bot-token lock."""
_patch_detect_s6(monkeypatch)
mgr = _S6Manager()
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: mgr,
)
_maybe_register_gateway_service("coder")
assert mgr.last_start_now is False, (
"profile creation must not auto-start the gateway service"
)
def test_register_swallows_duplicate_value_error(
monkeypatch: pytest.MonkeyPatch,
) -> None:

View file

@ -586,6 +586,35 @@ def test_s6_register_creates_service_dir_and_triggers_scan(
), f"s6-svscanctl -a not invoked; saw: {fake_subprocess_run}"
def test_s6_register_start_now_false_writes_down_marker(
s6_scandir, fake_subprocess_run,
) -> None:
"""When start_now=False, a `down` marker must be written so
s6-supervise does not auto-start the service on rescan."""
mgr = S6ServiceManager(scandir=s6_scandir)
mgr.register_profile_gateway("coder", start_now=False)
svc_dir = s6_scandir / "gateway-coder"
assert svc_dir.is_dir()
assert (svc_dir / "down").is_file(), (
"start_now=False must write a `down` marker file"
)
def test_s6_register_start_now_true_no_down_marker(
s6_scandir, fake_subprocess_run,
) -> None:
"""When start_now=True (default), no `down` marker should exist."""
mgr = S6ServiceManager(scandir=s6_scandir)
mgr.register_profile_gateway("coder")
svc_dir = s6_scandir / "gateway-coder"
assert svc_dir.is_dir()
assert not (svc_dir / "down").exists(), (
"start_now=True must NOT write a `down` marker file"
)
def test_s6_register_extra_env_is_quoted(s6_scandir, fake_subprocess_run) -> None:
mgr = S6ServiceManager(scandir=s6_scandir)
mgr.register_profile_gateway(