hermes-agent/tests/tools/test_docker_orphan_reaper_integration.py
Ben d77d877665 fix(docker): startup orphan reaper for crashed-process containers
The cleanup-fix in the previous commit handles the graceful-exit leak: a
Hermes process that runs ``atexit`` will now actually wait on the docker
stop/rm worker thread, so containers either survive (persist mode) or are
fully removed (opt-out mode) by the time the interpreter exits.

But ``atexit`` doesn't fire on SIGKILL, OOM-kill, or terminal-window
close. Containers from those exits stay parked with no surviving Python
process to reuse or remove them, so they accumulate until the operator
intervenes with ``docker rm -f``. The cleanup-fix doesn't help this class
— there's no live cleanup() to fix.

This commit adds the safety net: a startup orphan reaper that runs once
per Hermes process and removes long-Exited hermes-labeled containers
that the prior commit couldn't reach.

Implementation:

* New ``reap_orphan_containers()`` in ``tools/environments/docker.py``.
  Filters: ``label=hermes-agent=1`` + ``status=exited`` + (optional)
  ``label=hermes-profile=<current>``. Per-container ``docker inspect``
  parses ``State.FinishedAt`` (with nanosecond-precision trimming for
  Python's microsecond-bound ``fromisoformat``); containers older than
  the threshold get ``docker rm -f``'d. The ``status=exited`` filter is
  load-bearing — a running container may belong to a sibling Hermes
  process whose reuse path will pick it up; killing it would crash the
  sibling mid-command. Single-container failures are logged and the
  sweep continues to the next candidate.

* New ``_maybe_reap_docker_orphans()`` helper in
  ``tools/terminal_tool.py``. Wired into ``_create_environment()`` for
  ``env_type == "docker"``. Gated by:

    - ``terminal.docker_orphan_reaper: true`` (default; opt-out for
      operators running multiple Hermes processes in the same profile
      who don't trust the conservative defaults)
    - ``_docker_orphan_reaper_ran`` module flag with double-checked
      locking — parallel subagents and RL rollouts don't trigger N
      concurrent docker ps storms
    - Age threshold = ``2 × TERMINAL_LIFETIME_SECONDS`` with a 60s floor
      (so ``TERMINAL_LIFETIME_SECONDS=0`` doesn't race the user's own
      setup)
    - Profile scoping — a research profile NEVER reaps the default
      profile's stragglers
    - Exception swallow — a janitor failure must never block container
      creation

* New config ``terminal.docker_orphan_reaper`` wired through all four
  config-bridge sites (cli.py, gateway/run.py, hermes_cli/config.py,
  tests/conftest.py) and pinned by
  ``test_docker_orphan_reaper_is_bridged_everywhere``.

Coverage:

* 9 new unit tests in test_docker_environment.py — happy path, recent-
  container sparing, profile scoping, unparseable-timestamp safety,
  docker-ps-failure handling, partial-failure continuation, nanosecond
  timestamp parsing, zero-value FinishedAt rejection.
* 6 new integration tests in test_docker_orphan_reaper_integration.py
  — once-per-process gate, disable-flag respected, lifetime doubling
  with 60s floor, current-profile filter wiring, exception swallow.
* 1 new bridge-invariant regression test.

Closes #20561 (combined with the two prior commits on this branch).
2026-05-29 11:49:54 +10:00

139 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Integration tests for the docker orphan-reaper wiring in terminal_tool.
The reaper itself is unit-tested in tests/tools/test_docker_environment.py
under the "Orphan reaper" section. These tests cover the terminal_tool-side
gates: once-per-process behavior, the disable flag, and the
``lifetime_seconds`` doubling that determines the reaper's age threshold.
Issue #20561 — without these gates, parallel subagents would each fire the
reaper on container creation, and the ``terminal.docker_orphan_reaper: false``
opt-out would silently do nothing.
"""
import os
from unittest.mock import patch
import tools.terminal_tool as terminal_tool
def _reset_reaper_gate() -> None:
    """Re-arm the reaper by forcing the module's once-per-process flag to False."""
    terminal_tool._docker_orphan_reaper_ran = False
def test_maybe_reap_runs_once_per_process(monkeypatch):
    """The reaper sweep must run at most once per Python interpreter.

    Parallel subagents that each call _create_environment(env_type='docker')
    would otherwise fire N concurrent docker ps + inspect storms against the
    daemon and waste 5-10s of startup.
    """
    # Reset the gate via monkeypatch (rather than a bare assignment) so the
    # flag's previous value is restored at teardown and this test does not
    # leak a tripped gate into whichever test runs next in the same process.
    monkeypatch.setattr(terminal_tool, "_docker_orphan_reaper_ran", False)
    call_count = {"reap": 0}

    def _fake_reap(**kwargs):
        # Stand-in for the real sweep: count invocations, report 0 removals.
        call_count["reap"] += 1
        return 0

    with patch("tools.environments.docker.reap_orphan_containers", _fake_reap):
        config = {"docker_orphan_reaper": True}
        # Three back-to-back calls simulate repeated environment creation.
        for _ in range(3):
            terminal_tool._maybe_reap_docker_orphans(config)

    assert call_count["reap"] == 1, (
        f"reaper must run exactly once per process; got {call_count['reap']} calls"
    )
def test_maybe_reap_respects_disable_flag(monkeypatch):
    """``terminal.docker_orphan_reaper: false`` must skip the sweep entirely.

    With the flag disabled (passed here via the container config dict) there
    must be no docker ps, no inspect, no rm. This is the escape hatch for
    operators running multiple Hermes processes in the same profile.
    """
    # Reset the gate via monkeypatch so the flag's previous value is restored
    # at teardown instead of leaking into later tests.
    monkeypatch.setattr(terminal_tool, "_docker_orphan_reaper_ran", False)
    call_count = {"reap": 0}

    def _fake_reap(**kwargs):
        # Any invocation at all is a failure for this test.
        call_count["reap"] += 1
        return 0

    with patch("tools.environments.docker.reap_orphan_containers", _fake_reap):
        terminal_tool._maybe_reap_docker_orphans({"docker_orphan_reaper": False})

    assert call_count["reap"] == 0, "disabled reaper must not run any docker calls"
    # The once-per-process gate must NOT be tripped when the reaper is
    # disabled — that would prevent a subsequent toggle to true from working.
    assert terminal_tool._docker_orphan_reaper_ran is False
def test_maybe_reap_doubles_lifetime_for_max_age(monkeypatch):
    """The reaper's age threshold is ``2 × lifetime_seconds`` (with a floor).

    Generous default — gives sibling Hermes processes ample grace to be
    replaced without their just-exited containers being yanked. With
    ``TERMINAL_LIFETIME_SECONDS=300`` the reaper must receive
    ``max_age_seconds=600``.
    """
    # Reset the gate via monkeypatch so the flag's previous value is restored
    # at teardown instead of leaking into later tests.
    monkeypatch.setattr(terminal_tool, "_docker_orphan_reaper_ran", False)
    monkeypatch.setenv("TERMINAL_LIFETIME_SECONDS", "300")
    captured_args = {}

    def _fake_reap(**kwargs):
        # Record the keyword arguments the wiring passes to the sweep.
        captured_args.update(kwargs)
        return 0

    with patch("tools.environments.docker.reap_orphan_containers", _fake_reap):
        terminal_tool._maybe_reap_docker_orphans({"docker_orphan_reaper": True})

    assert captured_args.get("max_age_seconds") == 600, (
        f"expected 2 × 300 = 600, got {captured_args.get('max_age_seconds')}"
    )
def test_maybe_reap_floors_at_60_seconds(monkeypatch):
    """A tiny TERMINAL_LIFETIME_SECONDS must not yield a near-zero threshold.

    A user pinning ``TERMINAL_LIFETIME_SECONDS=0`` would otherwise get an
    effective age threshold of zero, which would race the user's own
    just-started container creation. The lifetime is floored at 60s before
    doubling, so the reaper must receive ``max_age_seconds=120``.
    """
    # Reset the gate via monkeypatch so the flag's previous value is restored
    # at teardown instead of leaking into later tests.
    monkeypatch.setattr(terminal_tool, "_docker_orphan_reaper_ran", False)
    monkeypatch.setenv("TERMINAL_LIFETIME_SECONDS", "0")
    captured_args = {}

    def _fake_reap(**kwargs):
        # Record the keyword arguments the wiring passes to the sweep.
        captured_args.update(kwargs)
        return 0

    with patch("tools.environments.docker.reap_orphan_containers", _fake_reap):
        terminal_tool._maybe_reap_docker_orphans({"docker_orphan_reaper": True})

    assert captured_args.get("max_age_seconds") == 120, (
        f"expected floored 60 × 2 = 120, got {captured_args.get('max_age_seconds')}"
    )
def test_maybe_reap_passes_current_profile_as_filter(monkeypatch):
    """The reaper must be scoped to the current Hermes profile.

    A research profile must NEVER reap default's containers. This verifies
    the profile-filter wiring: the active profile name is forwarded to the
    sweep as ``profile_filter``.
    """
    # Reset the gate via monkeypatch so the flag's previous value is restored
    # at teardown instead of leaking into later tests.
    monkeypatch.setattr(terminal_tool, "_docker_orphan_reaper_ran", False)
    captured_args = {}

    def _fake_reap(**kwargs):
        # Record the keyword arguments the wiring passes to the sweep.
        captured_args.update(kwargs)
        return 0

    with patch("tools.environments.docker.reap_orphan_containers", _fake_reap):
        # Pin the active profile so the expected filter value is deterministic.
        with patch(
            "tools.environments.docker._get_active_profile_name",
            return_value="research-bot",
        ):
            terminal_tool._maybe_reap_docker_orphans({"docker_orphan_reaper": True})

    assert captured_args.get("profile_filter") == "research-bot", (
        f"expected profile_filter='research-bot', got {captured_args.get('profile_filter')!r}"
    )
def test_maybe_reap_swallows_exceptions(monkeypatch):
    """A reaper crash must NOT block env creation.

    The reaper (docker daemon down, parse error in helper, ...) is
    best-effort plumbing, not a critical path; a failure inside it must not
    propagate out of ``_maybe_reap_docker_orphans``.
    """
    # Reset the gate via monkeypatch so the flag's previous value is restored
    # at teardown instead of leaking into later tests.
    monkeypatch.setattr(terminal_tool, "_docker_orphan_reaper_ran", False)
    attempts = []

    def _exploding_reap(**kwargs):
        # Record the call before blowing up so the test can prove the
        # failure path was really exercised.
        attempts.append(kwargs)
        raise RuntimeError("docker daemon ate the cat")

    with patch("tools.environments.docker.reap_orphan_containers", _exploding_reap):
        # Must not raise
        terminal_tool._maybe_reap_docker_orphans({"docker_orphan_reaper": True})

    # Without this check the test would pass vacuously if the sweep were
    # skipped (e.g. short-circuited by a gate) and nothing was ever raised.
    assert attempts, "reaper was never invoked; exception-swallow path not exercised"