mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-17 09:41:58 +00:00
fix(docker): skip gateway reconciliation in dashboard container (autodetect) (#46293)
* fix(docker): skip per-profile gateway reconciliation in dashboard container When gateway and dashboard containers share a bind-mounted HERMES_HOME, both run the cont-init.d profile reconciliation script, which creates s6-log processes for every persisted profile. These s6-log processes in different containers race to flock() the same log-directory lock files under logs/gateways/<profile>/lock, producing repeated "s6-log: fatal: unable to lock ... Resource busy" errors and a supervision restart storm. Add HERMES_SKIP_PROFILE_RECONCILE env var support to container_boot.py and set it in the official docker-compose.yml dashboard service so the dashboard container no longer creates per-profile gateway s6 services it never uses. * chore(release): map salvaged contributor * refactor(docker): autodetect dashboard container instead of env-var gate Replace the HERMES_SKIP_PROFILE_RECONCILE env var with PID 1 argv role detection. A dashboard-only container never spawns or supervises per-profile gateways, so the reconcile boot hook now skips itself when /proc/1/cmdline is the dashboard command — no operator flag to set (or forget in a hand-written manifest, which would reintroduce the s6-log flock storm this prevents). - Extract _strip_container_argv_prefix() shared by the legacy-gateway and new dashboard detectors (DRY the init/wrapper/hermes peel). - Add _is_dashboard_container(); gate reconcile main() on it. - Drop HERMES_SKIP_PROFILE_RECONCILE from code + docker-compose.yml. - Tests: argv matrix for both roles + main()-level skip/reconcile proof and a regression that the removed env var is now inert. Co-authored-by: 895252509 <895252509@qq.com> --------- Co-authored-by: zhouxiang <895252509@qq.com> Co-authored-by: Ben <ben@nousresearch.com>
This commit is contained in:
parent
6cb88a0874
commit
aca11c227e
3 changed files with 194 additions and 2 deletions
|
|
@ -207,8 +207,15 @@ def _read_container_argv() -> tuple[str, ...]:
|
|||
return tuple(part.decode("utf-8", "replace") for part in raw.split(b"\0") if part)
|
||||
|
||||
|
||||
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
|
||||
"""Return True for Docker commands equivalent to `gateway run`."""
|
||||
def _strip_container_argv_prefix(argv: Sequence[str]) -> list[str]:
|
||||
"""Strip the s6/wrapper prefix off PID 1 argv, leaving the hermes args.
|
||||
|
||||
The container PID 1 argv looks like
|
||||
``/init /opt/hermes/docker/main-wrapper.sh <subcommand> [args...]`` and
|
||||
the wrapper re-execs ``hermes <subcommand>``. Peel ``init`` →
|
||||
``main-wrapper.sh`` → ``hermes`` so callers can match on the bare
|
||||
subcommand. Shared by the legacy-gateway and dashboard role detectors.
|
||||
"""
|
||||
args = list(argv)
|
||||
if args and Path(args[0]).name == "init":
|
||||
args = args[1:]
|
||||
|
|
@ -216,11 +223,38 @@ def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
|
|||
args = args[1:]
|
||||
if args and Path(args[0]).name == "hermes":
|
||||
args = args[1:]
|
||||
return args
|
||||
|
||||
|
||||
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
|
||||
"""Return True for Docker commands equivalent to `gateway run`."""
|
||||
args = _strip_container_argv_prefix(argv)
|
||||
if "--no-supervise" in args:
|
||||
return False
|
||||
return len(args) >= 2 and args[0] == "gateway" and args[1] == "run"
|
||||
|
||||
|
||||
def _is_dashboard_container(argv: Sequence[str]) -> bool:
|
||||
"""Return True when the container's command is the dashboard.
|
||||
|
||||
A dashboard-only container (``hermes dashboard ...``) never spawns or
|
||||
supervises per-profile gateways — that is the gateway container's job.
|
||||
Reconciling profile gateway s6 slots there is not just wasted work: when
|
||||
the gateway and dashboard containers share a bind-mounted HERMES_HOME,
|
||||
both race to ``flock()`` the same ``logs/gateways/<profile>/lock`` files,
|
||||
producing "Resource busy" failures and an s6-log restart storm. So the
|
||||
dashboard container skips reconciliation entirely.
|
||||
|
||||
Detected from PID 1 argv (``/proc/1/cmdline``) rather than an operator
|
||||
flag: the role is a fact about the container's command, not a tunable,
|
||||
and a flag can be forgotten in a hand-written compose/k8s manifest —
|
||||
reintroducing the exact storm this prevents. Mirrors the argv handling
|
||||
in :func:`_is_legacy_gateway_run_request`.
|
||||
"""
|
||||
args = _strip_container_argv_prefix(argv)
|
||||
return bool(args) and args[0] == "dashboard"
|
||||
|
||||
|
||||
def _read_desired_state(profile_dir: Path) -> str | None:
|
||||
"""Read the persisted gateway desired state for reconciliation.
|
||||
|
||||
|
|
@ -393,6 +427,22 @@ _LOG_ROTATE_BYTES = 256 * 1024
|
|||
|
||||
def main() -> int:
|
||||
"""Entry point invoked from /etc/cont-init.d/02-reconcile-profiles."""
|
||||
# A dashboard-only container never spawns or supervises per-profile
|
||||
# gateways, so reconciling their s6 slots here is pure waste — and
|
||||
# actively harmful: when the gateway and dashboard containers share a
|
||||
# bind-mounted HERMES_HOME, both race to flock() the same s6-log lock
|
||||
# files under logs/gateways/<profile>/lock, producing "Resource busy"
|
||||
# failures and a restart storm. Detect the role from PID 1 argv and
|
||||
# skip reconciliation in the dashboard container. No operator flag:
|
||||
# the role is a fact about the container's command, and a flag can be
|
||||
# forgotten in a hand-written manifest, reintroducing the storm.
|
||||
if _is_dashboard_container(_read_container_argv()):
|
||||
print(
|
||||
"reconcile: skipping (dashboard container — does not need "
|
||||
"per-profile gateways)"
|
||||
)
|
||||
return 0
|
||||
|
||||
hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data"))
|
||||
scandir = Path(os.environ.get("S6_PROFILE_GATEWAY_SCANDIR", "/run/service"))
|
||||
actions = reconcile_profile_gateways(
|
||||
|
|
|
|||
|
|
@ -84,6 +84,7 @@ AUTHOR_MAP = {
|
|||
"290859878+synapsesx@users.noreply.github.com": "synapsesx",
|
||||
"157689911+itsflownium@users.noreply.github.com": "itsflownium",
|
||||
"dirtyren@users.noreply.github.com": "dirtyren",
|
||||
"895252509@qq.com": "895252509",
|
||||
"35259607+zxcasongs@users.noreply.github.com": "zxcasongs",
|
||||
"alfred@my-cloud.me": "alfred-smith-0",
|
||||
"tangtaizhong792@gmail.com": "tangtaizong666",
|
||||
|
|
|
|||
|
|
@ -708,3 +708,144 @@ def test_profiles_default_subdir_is_skipped_with_warning(
|
|||
assert any(
|
||||
"profiles/default/" in record.message for record in caplog.records
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dashboard-container role detection (skip reconcile on the dashboard)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"container_argv",
|
||||
[
|
||||
# Bare subcommand (docker run ... dashboard ...).
|
||||
("dashboard",),
|
||||
("dashboard", "--host", "127.0.0.1", "--no-open"),
|
||||
# Through s6 /init + the main-wrapper that re-execs `hermes`.
|
||||
("/init", "/opt/hermes/docker/main-wrapper.sh", "dashboard"),
|
||||
(
|
||||
"/init",
|
||||
"/opt/hermes/docker/main-wrapper.sh",
|
||||
"dashboard",
|
||||
"--host",
|
||||
"127.0.0.1",
|
||||
"--no-open",
|
||||
),
|
||||
# Wrapper that kept the explicit `hermes` argv0.
|
||||
("/init", "/opt/hermes/docker/main-wrapper.sh", "hermes", "dashboard"),
|
||||
],
|
||||
)
|
||||
def test_is_dashboard_container_true_for_dashboard_argv(
|
||||
container_argv: tuple[str, ...],
|
||||
) -> None:
|
||||
"""A dashboard command is detected across every wrapper prefix shape."""
|
||||
from hermes_cli.container_boot import _is_dashboard_container
|
||||
|
||||
assert _is_dashboard_container(container_argv) is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"container_argv",
|
||||
[
|
||||
(), # empty (/proc/1/cmdline unreadable) — not the dashboard
|
||||
("gateway", "run"),
|
||||
("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
|
||||
("/init", "/opt/hermes/docker/main-wrapper.sh", "hermes", "gateway", "run"),
|
||||
("chat",),
|
||||
# A profile literally named "dashboard" must NOT match — the token
|
||||
# we key on is the SUBCOMMAND, and `gateway run -p dashboard` is a
|
||||
# gateway container.
|
||||
("gateway", "run", "-p", "dashboard"),
|
||||
],
|
||||
)
|
||||
def test_is_dashboard_container_false_for_non_dashboard_argv(
|
||||
container_argv: tuple[str, ...],
|
||||
) -> None:
|
||||
"""Gateway / other commands (and empty argv) are not the dashboard."""
|
||||
from hermes_cli.container_boot import _is_dashboard_container
|
||||
|
||||
assert _is_dashboard_container(container_argv) is False
|
||||
|
||||
|
||||
def test_main_skips_reconcile_in_dashboard_container(
|
||||
tmp_path: Path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture[str],
|
||||
) -> None:
|
||||
"""main() must NOT reconcile when PID 1 argv is the dashboard command.
|
||||
|
||||
A running profile is seeded so that, if reconcile ran, it would create
|
||||
the gateway-<profile> slot. Asserting the slot is absent proves the
|
||||
skip is real, not just a log line.
|
||||
"""
|
||||
from hermes_cli import container_boot
|
||||
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "worker", state="running")
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir))
|
||||
monkeypatch.setattr(
|
||||
container_boot,
|
||||
"_read_container_argv",
|
||||
lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "dashboard"),
|
||||
)
|
||||
|
||||
rc = container_boot.main()
|
||||
|
||||
assert rc == 0
|
||||
assert not (scandir / "gateway-worker").exists()
|
||||
assert not (scandir / "gateway-default").exists()
|
||||
assert "skipping (dashboard container" in capsys.readouterr().out
|
||||
|
||||
|
||||
def test_main_reconciles_in_gateway_container(
|
||||
tmp_path: Path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""main() reconciles normally when PID 1 argv is the gateway command —
|
||||
the dashboard skip is scoped strictly to the dashboard role."""
|
||||
from hermes_cli import container_boot
|
||||
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "worker", state="running")
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir))
|
||||
monkeypatch.setattr(
|
||||
container_boot,
|
||||
"_read_container_argv",
|
||||
lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
|
||||
)
|
||||
|
||||
rc = container_boot.main()
|
||||
|
||||
assert rc == 0
|
||||
# The worker slot was registered + started (prior_state running).
|
||||
assert (scandir / "gateway-worker").exists()
|
||||
assert not (scandir / "gateway-worker" / "down").exists()
|
||||
|
||||
|
||||
def test_main_ignores_removed_skip_reconcile_env_var(
|
||||
tmp_path: Path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""The legacy HERMES_SKIP_PROFILE_RECONCILE flag is gone: setting it on a
|
||||
gateway container must NOT suppress reconciliation. Role is decided by
|
||||
PID 1 argv alone, so a stale flag in someone's manifest is inert."""
|
||||
from hermes_cli import container_boot
|
||||
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "worker", state="running")
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setenv("S6_PROFILE_GATEWAY_SCANDIR", str(scandir))
|
||||
monkeypatch.setenv("HERMES_SKIP_PROFILE_RECONCILE", "1")
|
||||
monkeypatch.setattr(
|
||||
container_boot,
|
||||
"_read_container_argv",
|
||||
lambda: ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
|
||||
)
|
||||
|
||||
rc = container_boot.main()
|
||||
|
||||
assert rc == 0
|
||||
# Reconcile still ran despite the stale env var.
|
||||
assert (scandir / "gateway-worker").exists()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue