diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py index d1a7ccd7d44..647545dd5da 100644 --- a/hermes_cli/container_boot.py +++ b/hermes_cli/container_boot.py @@ -207,8 +207,15 @@ def _read_container_argv() -> tuple[str, ...]: return tuple(part.decode("utf-8", "replace") for part in raw.split(b"\0") if part) -def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool: - """Return True for Docker commands equivalent to `gateway run`.""" +def _strip_container_argv_prefix(argv: Sequence[str]) -> list[str]: + """Strip the s6/wrapper prefix off PID 1 argv, leaving the hermes args. + + The container PID 1 argv looks like + ``/init /opt/hermes/docker/main-wrapper.sh [args...]`` and + the wrapper re-execs ``hermes [args...]``. Peel ``init`` → + ``main-wrapper.sh`` → ``hermes`` so callers can match on the bare + subcommand. Shared by the legacy-gateway and dashboard role detectors. + """ args = list(argv) if args and Path(args[0]).name == "init": args = args[1:] @@ -216,11 +223,38 @@ def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool: args = args[1:] if args and Path(args[0]).name == "hermes": args = args[1:] + return args + + +def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool: + """Return True for Docker commands equivalent to `gateway run`.""" + args = _strip_container_argv_prefix(argv) if "--no-supervise" in args: return False return len(args) >= 2 and args[0] == "gateway" and args[1] == "run" +def _is_dashboard_container(argv: Sequence[str]) -> bool: + """Return True when the container's command is the dashboard. + + A dashboard-only container (``hermes dashboard ...``) never spawns or + supervises per-profile gateways — that is the gateway container's job. 
+ Reconciling profile gateway s6 slots there is not just wasted work: when + the gateway and dashboard containers share a bind-mounted HERMES_HOME, + both race to ``flock()`` the same ``logs/gateways/<profile>/lock`` files, + producing "Resource busy" failures and an s6-log restart storm. So the + dashboard container skips reconciliation entirely. + + Detected from PID 1 argv (``/proc/1/cmdline``) rather than an operator + flag: the role is a fact about the container's command, not a tunable, + and a flag can be forgotten in a hand-written compose/k8s manifest — + reintroducing the exact storm this prevents. Mirrors the argv handling + in :func:`_is_legacy_gateway_run_request`. + """ + args = _strip_container_argv_prefix(argv) + return bool(args) and args[0] == "dashboard" + + def _read_desired_state(profile_dir: Path) -> str | None: """Read the persisted gateway desired state for reconciliation. @@ -393,6 +427,22 @@ _LOG_ROTATE_BYTES = 256 * 1024 def main() -> int: """Entry point invoked from /etc/cont-init.d/02-reconcile-profiles.""" + # A dashboard-only container never spawns or supervises per-profile + # gateways, so reconciling their s6 slots here is pure waste — and + # actively harmful: when the gateway and dashboard containers share a + # bind-mounted HERMES_HOME, both race to flock() the same s6-log lock + # files under logs/gateways/<profile>/lock, producing "Resource busy" + # failures and a restart storm. Detect the role from PID 1 argv and + # skip reconciliation in the dashboard container. No operator flag: + # the role is a fact about the container's command, and a flag can be + # forgotten in a hand-written manifest, reintroducing the storm. 
+ if _is_dashboard_container(_read_container_argv()): + print( + "reconcile: skipping (dashboard container — does not need " + "per-profile gateways)" + ) + return 0 + hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data")) scandir = Path(os.environ.get("S6_PROFILE_GATEWAY_SCANDIR", "/run/service")) actions = reconcile_profile_gateways( diff --git a/scripts/release.py b/scripts/release.py index 63fb97a49c5..c5b35977332 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -84,6 +84,7 @@ AUTHOR_MAP = { "290859878+synapsesx@users.noreply.github.com": "synapsesx", "157689911+itsflownium@users.noreply.github.com": "itsflownium", "dirtyren@users.noreply.github.com": "dirtyren", + "895252509@qq.com": "895252509", "35259607+zxcasongs@users.noreply.github.com": "zxcasongs", "alfred@my-cloud.me": "alfred-smith-0", "tangtaizhong792@gmail.com": "tangtaizong666", diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py index db43ff90f1c..a86321a6887 100644 --- a/tests/hermes_cli/test_container_boot.py +++ b/tests/hermes_cli/test_container_boot.py @@ -708,3 +708,144 @@ def test_profiles_default_subdir_is_skipped_with_warning( assert any( "profiles/default/" in record.message for record in caplog.records ) + + +# --------------------------------------------------------------------------- +# Dashboard-container role detection (skip reconcile on the dashboard) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "container_argv", + [ + # Bare subcommand (docker run ... dashboard ...). + ("dashboard",), + ("dashboard", "--host", "127.0.0.1", "--no-open"), + # Through s6 /init + the main-wrapper that re-execs `hermes`. + ("/init", "/opt/hermes/docker/main-wrapper.sh", "dashboard"), + ( + "/init", + "/opt/hermes/docker/main-wrapper.sh", + "dashboard", + "--host", + "127.0.0.1", + "--no-open", + ), + # Wrapper that kept the explicit `hermes` argv0. 
+ ("/init", "/opt/hermes/docker/main-wrapper.sh", "hermes", "dashboard"), + ], +) +def test_is_dashboard_container_true_for_dashboard_argv( + container_argv: tuple[str, ...], +) -> None: + """A dashboard command is detected across every wrapper prefix shape.""" + from hermes_cli.container_boot import _is_dashboard_container + + assert _is_dashboard_container(container_argv) is True + + +@pytest.mark.parametrize( + "container_argv", + [ + (), # empty (/proc/1/cmdline unreadable) — not the dashboard + ("gateway", "run"), + ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"), + ("/init", "/opt/hermes/docker/main-wrapper.sh", "hermes", "gateway", "run"), + ("chat",), + # A profile literally named "dashboard" must NOT match — the token + # we key on is the SUBCOMMAND, and `gateway run -p dashboard` is a + # gateway container. + ("gateway", "run", "-p", "dashboard"), + ], +) +def test_is_dashboard_container_false_for_non_dashboard_argv( + container_argv: tuple[str, ...], +) -> None: + """Gateway / other commands (and empty argv) are not the dashboard.""" + from hermes_cli.container_boot import _is_dashboard_container + + assert _is_dashboard_container(container_argv) is False + + +def test_main_skips_reconcile_in_dashboard_container( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """main() must NOT reconcile when PID 1 argv is the dashboard command. + + A running profile is seeded so that, if reconcile ran, it would create + the gateway-<profile> slot. Asserting the slot is absent proves the + skip is real, not just a log line. 
+ """ + from hermes_cli import container_boot + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "worker", state="running") + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir)) + monkeypatch.setattr( + container_boot, + "_read_container_argv", + lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "dashboard"), + ) + + rc = container_boot.main() + + assert rc == 0 + assert not (scandir / "gateway-worker").exists() + assert not (scandir / "gateway-default").exists() + assert "skipping (dashboard container" in capsys.readouterr().out + + +def test_main_reconciles_in_gateway_container( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """main() reconciles normally when PID 1 argv is the gateway command — + the dashboard skip is scoped strictly to the dashboard role.""" + from hermes_cli import container_boot + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "worker", state="running") + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir)) + monkeypatch.setattr( + container_boot, + "_read_container_argv", + lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"), + ) + + rc = container_boot.main() + + assert rc == 0 + # The worker slot was registered + started (prior_state running). + assert (scandir / "gateway-worker").exists() + assert not (scandir / "gateway-worker" / "down").exists() + + +def test_main_ignores_removed_skip_reconcile_env_var( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """The legacy HERMES_SKIP_PROFILE_RECONCILE flag is gone: setting it on a + gateway container must NOT suppress reconciliation. 
Role is decided by + PID 1 argv alone, so a stale flag in someone's manifest is inert.""" + from hermes_cli import container_boot + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "worker", state="running") + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir)) + monkeypatch.setenv("HERMES_SKIP_PROFILE_RECONCILE", "1") + monkeypatch.setattr( + container_boot, + "_read_container_argv", + lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"), + ) + + rc = container_boot.main() + + assert rc == 0 + # Reconcile still ran despite the stale env var. + assert (scandir / "gateway-worker").exists()