mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-16 09:31:37 +00:00
fix(s6): register profile gateways without auto-starting (#46266)
* fix(s6): prevent profile create from auto-starting gateway service When hermes profile create runs inside an s6 container, _maybe_register_gateway_service() calls register_profile_gateway() which creates the service directory and triggers s6-svscanctl -a. Previously the service always started immediately, causing profiles that share the main gateway's bot token (e.g. Kanban worker profiles) to fail with a token-lock conflict and persist gateway_state: running — becoming zombies that resurrect on every container restart. Wire the existing start_now parameter through the S6 implementation: when start_now=False, write a marker file (same pattern as container_boot.py _register_gateway_slot) so s6-supervise leaves the service stopped until the user explicitly runs hermes -p <profile> gateway start. 4 files, +61/-6, 4 new tests (all passing). * test(docker): wait for gateway running state before restart --------- Co-authored-by: liuhao1024 <sunsky.lau@gmail.com>
This commit is contained in:
parent
4eb0ff639b
commit
40d7c264f0
5 changed files with 63 additions and 7 deletions
|
|
@ -1268,7 +1268,7 @@ def _maybe_register_gateway_service(profile_name: str) -> None:
|
|||
if not mgr.supports_runtime_registration():
|
||||
return # host backend; no-op
|
||||
try:
|
||||
mgr.register_profile_gateway(profile_name)
|
||||
mgr.register_profile_gateway(profile_name, start_now=False)
|
||||
except ValueError:
|
||||
# Already registered (e.g. the container-boot reconciler ran
|
||||
# first and brought up a stale slot). That's fine.
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ class ServiceManager(Protocol):
|
|||
profile: str,
|
||||
*,
|
||||
extra_env: dict[str, str] | None = None,
|
||||
start_now: bool = True,
|
||||
) -> None: ...
|
||||
def unregister_profile_gateway(self, profile: str) -> None: ...
|
||||
def list_profile_gateways(self) -> list[str]: ...
|
||||
|
|
@ -182,6 +183,7 @@ class _RegistrationUnsupportedMixin:
|
|||
profile: str,
|
||||
*,
|
||||
extra_env: dict[str, str] | None = None,
|
||||
start_now: bool = True,
|
||||
) -> None:
|
||||
raise NotImplementedError(
|
||||
f"{type(self).__name__} does not support runtime profile "
|
||||
|
|
@ -830,15 +832,15 @@ class S6ServiceManager:
|
|||
profile: str,
|
||||
*,
|
||||
extra_env: dict[str, str] | None = None,
|
||||
start_now: bool = True,
|
||||
) -> None:
|
||||
"""Create the s6 service directory for a profile gateway.
|
||||
|
||||
Triggers ``s6-svscanctl -a`` so s6-svscan picks the new directory
|
||||
up immediately. The service is created in the *up* state — to
|
||||
register without auto-starting, follow up with ``stop(profile)``
|
||||
(or pass the start flag via the future ``start_now=False`` arg,
|
||||
which the Phase 4 reconciliation path uses via a ``down``
|
||||
marker file written directly).
|
||||
up immediately. When *start_now* is ``True`` (the default) the
|
||||
service starts immediately; when ``False`` a ``down`` marker file
|
||||
is written so s6-supervise leaves the service stopped until the
|
||||
user explicitly runs ``hermes -p <profile> gateway start``.
|
||||
|
||||
Raises:
|
||||
ValueError: if the profile name is invalid or the service
|
||||
|
|
@ -886,6 +888,13 @@ class S6ServiceManager:
|
|||
# rationale.
|
||||
_seed_supervise_skeleton(tmp_dir)
|
||||
|
||||
# When start_now is False, write a `down` marker so
|
||||
# s6-supervise does not auto-start the service on rescan.
|
||||
# Mirrors the same pattern in container_boot.py
|
||||
# _register_gateway_slot when start=False.
|
||||
if not start_now:
|
||||
(tmp_dir / "down").touch()
|
||||
|
||||
tmp_dir.rename(svc_dir)
|
||||
except Exception:
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
|
|
|
|||
|
|
@ -296,7 +296,8 @@ def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp(
|
|||
)
|
||||
if r.returncode == 0 and '"gateway_state"' in r.stdout:
|
||||
state = r.stdout
|
||||
break
|
||||
if '"running"' in state:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert '"running"' in state, (
|
||||
f"gateway never persisted running state pre-restart: {state!r}"
|
||||
|
|
|
|||
|
|
@ -54,10 +54,12 @@ class _S6Manager:
|
|||
def register_profile_gateway(
|
||||
self, profile: str, *,
|
||||
extra_env: dict[str, str] | None = None,
|
||||
start_now: bool = True,
|
||||
) -> None:
|
||||
if self.raise_on_register is not None:
|
||||
raise self.raise_on_register
|
||||
self.registered.append(profile)
|
||||
self.last_start_now = start_now
|
||||
|
||||
def unregister_profile_gateway(self, profile: str) -> None:
|
||||
if self.raise_on_unregister is not None:
|
||||
|
|
@ -107,6 +109,21 @@ def test_register_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None:
|
|||
assert mgr.registered == ["coder"]
|
||||
|
||||
|
||||
def test_register_passes_start_now_false(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""_maybe_register_gateway_service must register with start_now=False
|
||||
so that profile creation does not auto-start a gateway that may
|
||||
conflict with the main gateway's bot-token lock."""
|
||||
_patch_detect_s6(monkeypatch)
|
||||
mgr = _S6Manager()
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: mgr,
|
||||
)
|
||||
_maybe_register_gateway_service("coder")
|
||||
assert mgr.last_start_now is False, (
|
||||
"profile creation must not auto-start the gateway service"
|
||||
)
|
||||
|
||||
|
||||
def test_register_swallows_duplicate_value_error(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
|
|
|
|||
|
|
@ -586,6 +586,35 @@ def test_s6_register_creates_service_dir_and_triggers_scan(
|
|||
), f"s6-svscanctl -a not invoked; saw: {fake_subprocess_run}"
|
||||
|
||||
|
||||
def test_s6_register_start_now_false_writes_down_marker(
|
||||
s6_scandir, fake_subprocess_run,
|
||||
) -> None:
|
||||
"""When start_now=False, a `down` marker must be written so
|
||||
s6-supervise does not auto-start the service on rescan."""
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
mgr.register_profile_gateway("coder", start_now=False)
|
||||
|
||||
svc_dir = s6_scandir / "gateway-coder"
|
||||
assert svc_dir.is_dir()
|
||||
assert (svc_dir / "down").is_file(), (
|
||||
"start_now=False must write a `down` marker file"
|
||||
)
|
||||
|
||||
|
||||
def test_s6_register_start_now_true_no_down_marker(
|
||||
s6_scandir, fake_subprocess_run,
|
||||
) -> None:
|
||||
"""When start_now=True (default), no `down` marker should exist."""
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
mgr.register_profile_gateway("coder")
|
||||
|
||||
svc_dir = s6_scandir / "gateway-coder"
|
||||
assert svc_dir.is_dir()
|
||||
assert not (svc_dir / "down").exists(), (
|
||||
"start_now=True must NOT write a `down` marker file"
|
||||
)
|
||||
|
||||
|
||||
def test_s6_register_extra_env_is_quoted(s6_scandir, fake_subprocess_run) -> None:
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
mgr.register_profile_gateway(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue