fix(docker): skip gateway reconciliation in dashboard container (autodetect) (#46293)

* fix(docker): skip per-profile gateway reconciliation in dashboard container

When gateway and dashboard containers share a bind-mounted HERMES_HOME,
both run the cont-init.d profile reconciliation script, which creates
s6-log processes for every persisted profile.  These s6-log processes
in different containers race to flock() the same log-directory lock
files under logs/gateways/<profile>/lock, producing repeated
"s6-log: fatal: unable to lock ... Resource busy" errors and a
supervision restart storm.

Add HERMES_SKIP_PROFILE_RECONCILE env var support to container_boot.py
and set it in the official docker-compose.yml dashboard service so the
dashboard container no longer creates per-profile gateway s6 services
it never uses.

* chore(release): map salvaged contributor

* refactor(docker): autodetect dashboard container instead of env-var gate

Replace the HERMES_SKIP_PROFILE_RECONCILE env var with PID 1 argv role
detection. A dashboard-only container never spawns or supervises
per-profile gateways, so the reconcile boot hook now skips itself when
/proc/1/cmdline is the dashboard command — no operator flag to set (or
forget in a hand-written manifest, which would reintroduce the s6-log
flock storm this prevents).

- Extract _strip_container_argv_prefix() shared by the legacy-gateway
  and new dashboard detectors (DRY the init/wrapper/hermes peel).
- Add _is_dashboard_container(); gate reconcile main() on it.
- Drop HERMES_SKIP_PROFILE_RECONCILE from code + docker-compose.yml.
- Tests: argv matrix for both roles + main()-level skip/reconcile proof
  and a regression that the removed env var is now inert.

Co-authored-by: 895252509 <895252509@qq.com>

---------

Co-authored-by: zhouxiang <895252509@qq.com>
Co-authored-by: Ben <ben@nousresearch.com>
This commit is contained in:
Teknium 2026-06-15 03:51:48 -07:00 committed by GitHub
parent 6cb88a0874
commit aca11c227e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 194 additions and 2 deletions

View file

@ -207,8 +207,15 @@ def _read_container_argv() -> tuple[str, ...]:
return tuple(part.decode("utf-8", "replace") for part in raw.split(b"\0") if part)
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
"""Return True for Docker commands equivalent to `gateway run`."""
def _strip_container_argv_prefix(argv: Sequence[str]) -> list[str]:
"""Strip the s6/wrapper prefix off PID 1 argv, leaving the hermes args.
The container PID 1 argv looks like
``/init /opt/hermes/docker/main-wrapper.sh <subcommand> [args...]`` and
the wrapper re-execs ``hermes <subcommand>``. Peel ``init``
``main-wrapper.sh`` ``hermes`` so callers can match on the bare
subcommand. Shared by the legacy-gateway and dashboard role detectors.
"""
args = list(argv)
if args and Path(args[0]).name == "init":
args = args[1:]
@ -216,11 +223,38 @@ def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
args = args[1:]
if args and Path(args[0]).name == "hermes":
args = args[1:]
return args
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
"""Return True for Docker commands equivalent to `gateway run`."""
args = _strip_container_argv_prefix(argv)
if "--no-supervise" in args:
return False
return len(args) >= 2 and args[0] == "gateway" and args[1] == "run"
def _is_dashboard_container(argv: Sequence[str]) -> bool:
"""Return True when the container's command is the dashboard.
A dashboard-only container (``hermes dashboard ...``) never spawns or
supervises per-profile gateways that is the gateway container's job.
Reconciling profile gateway s6 slots there is not just wasted work: when
the gateway and dashboard containers share a bind-mounted HERMES_HOME,
both race to ``flock()`` the same ``logs/gateways/<profile>/lock`` files,
producing "Resource busy" failures and an s6-log restart storm. So the
dashboard container skips reconciliation entirely.
Detected from PID 1 argv (``/proc/1/cmdline``) rather than an operator
flag: the role is a fact about the container's command, not a tunable,
and a flag can be forgotten in a hand-written compose/k8s manifest
reintroducing the exact storm this prevents. Mirrors the argv handling
in :func:`_is_legacy_gateway_run_request`.
"""
args = _strip_container_argv_prefix(argv)
return bool(args) and args[0] == "dashboard"
def _read_desired_state(profile_dir: Path) -> str | None:
"""Read the persisted gateway desired state for reconciliation.
@ -393,6 +427,22 @@ _LOG_ROTATE_BYTES = 256 * 1024
def main() -> int:
"""Entry point invoked from /etc/cont-init.d/02-reconcile-profiles."""
# A dashboard-only container never spawns or supervises per-profile
# gateways, so reconciling their s6 slots here is pure waste — and
# actively harmful: when the gateway and dashboard containers share a
# bind-mounted HERMES_HOME, both race to flock() the same s6-log lock
# files under logs/gateways/<profile>/lock, producing "Resource busy"
# failures and a restart storm. Detect the role from PID 1 argv and
# skip reconciliation in the dashboard container. No operator flag:
# the role is a fact about the container's command, and a flag can be
# forgotten in a hand-written manifest, reintroducing the storm.
if _is_dashboard_container(_read_container_argv()):
print(
"reconcile: skipping (dashboard container — does not need "
"per-profile gateways)"
)
return 0
hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data"))
scandir = Path(os.environ.get("S6_PROFILE_GATEWAY_SCANDIR", "/run/service"))
actions = reconcile_profile_gateways(

View file

@ -84,6 +84,7 @@ AUTHOR_MAP = {
"290859878+synapsesx@users.noreply.github.com": "synapsesx",
"157689911+itsflownium@users.noreply.github.com": "itsflownium",
"dirtyren@users.noreply.github.com": "dirtyren",
"895252509@qq.com": "895252509",
"35259607+zxcasongs@users.noreply.github.com": "zxcasongs",
"alfred@my-cloud.me": "alfred-smith-0",
"tangtaizhong792@gmail.com": "tangtaizong666",

View file

@ -708,3 +708,144 @@ def test_profiles_default_subdir_is_skipped_with_warning(
assert any(
"profiles/default/" in record.message for record in caplog.records
)
# ---------------------------------------------------------------------------
# Dashboard-container role detection (skip reconcile on the dashboard)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"container_argv",
[
# Bare subcommand (docker run ... dashboard ...).
("dashboard",),
("dashboard", "--host", "127.0.0.1", "--no-open"),
# Through s6 /init + the main-wrapper that re-execs `hermes`.
("/init", "/opt/hermes/docker/main-wrapper.sh", "dashboard"),
(
"/init",
"/opt/hermes/docker/main-wrapper.sh",
"dashboard",
"--host",
"127.0.0.1",
"--no-open",
),
# Wrapper that kept the explicit `hermes` argv0.
("/init", "/opt/hermes/docker/main-wrapper.sh", "hermes", "dashboard"),
],
)
def test_is_dashboard_container_true_for_dashboard_argv(
container_argv: tuple[str, ...],
) -> None:
"""A dashboard command is detected across every wrapper prefix shape."""
from hermes_cli.container_boot import _is_dashboard_container
assert _is_dashboard_container(container_argv) is True
@pytest.mark.parametrize(
"container_argv",
[
(), # empty (/proc/1/cmdline unreadable) — not the dashboard
("gateway", "run"),
("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
("/init", "/opt/hermes/docker/main-wrapper.sh", "hermes", "gateway", "run"),
("chat",),
# A profile literally named "dashboard" must NOT match — the token
# we key on is the SUBCOMMAND, and `gateway run -p dashboard` is a
# gateway container.
("gateway", "run", "-p", "dashboard"),
],
)
def test_is_dashboard_container_false_for_non_dashboard_argv(
container_argv: tuple[str, ...],
) -> None:
"""Gateway / other commands (and empty argv) are not the dashboard."""
from hermes_cli.container_boot import _is_dashboard_container
assert _is_dashboard_container(container_argv) is False
def test_main_skips_reconcile_in_dashboard_container(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture[str],
) -> None:
"""main() must NOT reconcile when PID 1 argv is the dashboard command.
A running profile is seeded so that, if reconcile ran, it would create
the gateway-<profile> slot. Asserting the slot is absent proves the
skip is real, not just a log line.
"""
from hermes_cli import container_boot
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "worker", state="running")
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir))
monkeypatch.setattr(
container_boot,
"_read_container_argv",
lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "dashboard"),
)
rc = container_boot.main()
assert rc == 0
assert not (scandir / "gateway-worker").exists()
assert not (scandir / "gateway-default").exists()
assert "skipping (dashboard container" in capsys.readouterr().out
def test_main_reconciles_in_gateway_container(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""main() reconciles normally when PID 1 argv is the gateway command —
the dashboard skip is scoped strictly to the dashboard role."""
from hermes_cli import container_boot
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "worker", state="running")
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir))
monkeypatch.setattr(
container_boot,
"_read_container_argv",
lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
)
rc = container_boot.main()
assert rc == 0
# The worker slot was registered + started (prior_state running).
assert (scandir / "gateway-worker").exists()
assert not (scandir / "gateway-worker" / "down").exists()
def test_main_ignores_removed_skip_reconcile_env_var(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""The legacy HERMES_SKIP_PROFILE_RECONCILE flag is gone: setting it on a
gateway container must NOT suppress reconciliation. Role is decided by
PID 1 argv alone, so a stale flag in someone's manifest is inert."""
from hermes_cli import container_boot
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "worker", state="running")
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir))
monkeypatch.setenv("HERMES_SKIP_PROFILE_RECONCILE", "1")
monkeypatch.setattr(
container_boot,
"_read_container_argv",
lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
)
rc = container_boot.main()
assert rc == 0
# Reconcile still ran despite the stale env var.
assert (scandir / "gateway-worker").exists()