mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
The cleanup-fix in the previous commit handles the graceful-exit leak: a
Hermes process that runs ``atexit`` will now actually wait on the docker
stop/rm worker thread, so containers either survive (persist mode) or are
fully removed (opt-out mode) by the time the interpreter exits.
But ``atexit`` doesn't fire on SIGKILL, OOM-kill, or terminal-window
close. Containers from those exits stay parked with no surviving Python
process to reuse or remove them, so they accumulate until the operator
intervenes with ``docker rm -f``. The cleanup-fix doesn't help this class
— there's no live cleanup() to fix.
This commit adds the safety net: a startup orphan reaper that runs once
per Hermes process and removes long-Exited hermes-labeled containers
that the prior commit couldn't reach.
Implementation:
* New ``reap_orphan_containers()`` in ``tools/environments/docker.py``.
Filters: ``label=hermes-agent=1`` + ``status=exited`` + (optional)
``label=hermes-profile=<current>``. Per-container ``docker inspect``
parses ``State.FinishedAt`` (with nanosecond-precision trimming for
Python's microsecond-bound ``fromisoformat``); containers older than
the threshold get ``docker rm -f``'d. The ``status=exited`` filter is
load-bearing — a running container may belong to a sibling Hermes
process whose reuse path will pick it up; killing it would crash the
sibling mid-command. Single-container failures are logged and the
sweep continues to the next candidate.
* New ``_maybe_reap_docker_orphans()`` helper in
``tools/terminal_tool.py``. Wired into ``_create_environment()`` for
``env_type == "docker"``. Gated by:
- ``terminal.docker_orphan_reaper: true`` (default; opt-out for
operators running multiple Hermes processes in the same profile
who don't trust the conservative defaults)
- ``_docker_orphan_reaper_ran`` module flag with double-checked
locking — parallel subagents and RL rollouts don't trigger N
concurrent docker ps storms
- Age threshold = ``2 × TERMINAL_LIFETIME_SECONDS`` with a 60s floor
(so ``TERMINAL_LIFETIME_SECONDS=0`` doesn't race the user's own
setup)
- Profile scoping — a research profile NEVER reaps the default
profile's stragglers
- Exception swallow — a janitor failure must never block container
creation
* New config ``terminal.docker_orphan_reaper`` wired through all four
config-bridge sites (cli.py, gateway/run.py, hermes_cli/config.py,
tests/conftest.py) and pinned by
``test_docker_orphan_reaper_is_bridged_everywhere``.
Coverage:
* 9 new unit tests in test_docker_environment.py — happy path, recent-
container sparing, profile scoping, unparseable-timestamp safety,
docker-ps-failure handling, partial-failure continuation, nanosecond
timestamp parsing, zero-value FinishedAt rejection.
* 6 new integration tests in test_docker_orphan_reaper_integration.py
— once-per-process gate, disable-flag respected, lifetime doubling
with 60s floor, current-profile filter wiring, exception swallow.
* 1 new bridge-invariant regression test.
Closes #20561 (combined with the two prior commits on this branch).
262 lines
12 KiB
Python
262 lines
12 KiB
Python
"""Regression tests for terminal config -> env-var bridging.
|
|
|
|
terminal_tool._get_env_config() reads ALL terminal settings from os.environ
|
|
(TERMINAL_*). config.yaml values therefore have to be bridged into env vars
|
|
at startup, by THREE separate code paths:
|
|
|
|
1. cli.py -> ``env_mappings`` dict (CLI / TUI startup)
|
|
2. gateway/run.py -> ``_terminal_env_map`` dict (gateway / messaging
|
|
platforms)
|
|
3. hermes_cli/config.py:set_config_value
|
|
-> ``_config_to_env_sync`` dict (one-shot when the
|
|
user runs ``hermes config set …``)
|
|
|
|
If any one of these is missing a key, the corresponding config.yaml setting
|
|
silently does nothing for that entry-point. This bug already shipped once
|
|
for ``docker_run_as_host_user`` (gateway and CLI maps) and once for
|
|
``docker_mount_cwd_to_workspace`` (gateway map).
|
|
|
|
This test guards against future drift by extracting all three maps via source
|
|
inspection and asserting they all bridge the same set of writable
|
|
``terminal.*`` keys. Source inspection (rather than importing the live
|
|
dicts) keeps the test independent of the user's ~/.hermes/config.yaml and
|
|
mirrors the pattern used in tests/hermes_cli/test_config_drift.py.
|
|
"""
|
|
|
|
import ast
|
|
import inspect
|
|
|
|
|
|
def _extract_dict_values(source: str, dict_name: str) -> set[str]:
|
|
"""Return the set of *value* strings in `dict_name = { "k": "VALUE", ... }`.
|
|
|
|
We parse the source with ast (so multi-line dicts and comments are
|
|
handled) instead of regex. The first matching assignment wins.
|
|
"""
|
|
tree = ast.parse(source)
|
|
for node in ast.walk(tree):
|
|
if not isinstance(node, ast.Assign):
|
|
continue
|
|
targets = [t for t in node.targets if isinstance(t, ast.Name)]
|
|
if not any(t.id == dict_name for t in targets):
|
|
continue
|
|
if not isinstance(node.value, ast.Dict):
|
|
continue
|
|
out: set[str] = set()
|
|
for k, v in zip(node.value.keys, node.value.values):
|
|
if isinstance(k, ast.Constant) and isinstance(v, ast.Constant):
|
|
if isinstance(v.value, str):
|
|
out.add(v.value)
|
|
return out
|
|
raise AssertionError(f"Could not find `{dict_name} = {{...}}` literal in source")
|
|
|
|
|
|
def _extract_dict_keys(source: str, dict_name: str) -> set[str]:
|
|
"""Return the set of *key* strings in `dict_name = { "KEY": "v", ... }`."""
|
|
tree = ast.parse(source)
|
|
for node in ast.walk(tree):
|
|
if not isinstance(node, ast.Assign):
|
|
continue
|
|
targets = [t for t in node.targets if isinstance(t, ast.Name)]
|
|
if not any(t.id == dict_name for t in targets):
|
|
continue
|
|
if not isinstance(node.value, ast.Dict):
|
|
continue
|
|
out: set[str] = set()
|
|
for k in node.value.keys:
|
|
if isinstance(k, ast.Constant) and isinstance(k.value, str):
|
|
out.add(k.value)
|
|
return out
|
|
raise AssertionError(f"Could not find `{dict_name} = {{...}}` literal in source")
|
|
|
|
|
|
def _cli_env_map_keys() -> set[str]:
    """Keys of the ``env_mappings`` dict literal inside ``cli.load_cli_config``.

    Only the function's source text is inspected; ``load_cli_config`` itself
    is never called.
    """
    import cli

    loader_source = inspect.getsource(cli.load_cli_config)
    return _extract_dict_keys(source=loader_source, dict_name="env_mappings")
|
|
|
|
|
|
def _gateway_env_map_keys() -> set[str]:
    """Keys of the ``_terminal_env_map`` dict literal in ``gateway/run.py``.

    The dict is built at module top level rather than inside a function, so
    the source of the whole module is scanned.
    """
    import gateway.run as gr

    module_source = inspect.getsource(gr)
    return _extract_dict_keys(source=module_source, dict_name="_terminal_env_map")
|
|
|
|
|
|
def _save_config_env_sync_keys() -> set[str]:
    """Leaf terminal keys bridged by ``hermes config set foo bar``.

    Reads the ``_config_to_env_sync`` dict literal from the source of
    ``hermes_cli.config.set_config_value``. That dict uses fully-qualified
    ``terminal.foo`` keys, so the ``terminal.`` prefix is dropped to make the
    result comparable with the other two maps, which use bare leaf keys.
    Keys outside the ``terminal.`` namespace are ignored.
    """
    from hermes_cli import config as hc_config

    prefix = "terminal."
    qualified_keys = _extract_dict_keys(
        inspect.getsource(hc_config.set_config_value),
        "_config_to_env_sync",
    )
    return {
        name[len(prefix):] for name in qualified_keys if name.startswith(prefix)
    }
|
|
|
|
|
|
# Keys present in cli.py env_mappings but intentionally absent from
|
|
# gateway/run.py or set_config_value. Each entry must be justified.
|
|
_CLI_ONLY_OK = frozenset({
|
|
# `env_type` is a legacy YAML key alias for `backend` that cli.py
|
|
# accepts for backwards-compat with older cli-config.yaml. The
|
|
# gateway path normalizes on the canonical `backend` key, which is
|
|
# also in the map and handles the same bridging. See cli.py ~line 515.
|
|
"env_type",
|
|
# sudo_password is not a terminal-backend option — it's a credential
|
|
# used across backends, bridged to $SUDO_PASSWORD (not TERMINAL_*).
|
|
# Treating it as terminal-only would be misleading.
|
|
"sudo_password",
|
|
})
|
|
|
|
|
|
def _terminal_tool_env_var_names() -> set[str]:
    """Every quoted ``TERMINAL_*`` name appearing in ``tools.terminal_tool``.

    This is a naive textual scan of the module source: any single- or
    double-quoted literal of the form ``TERMINAL_[A-Z0-9_]+`` is collected,
    which is intended to cover ``os.getenv("TERMINAL_X", ...)`` and
    ``_parse_env_var("TERMINAL_X", ...)`` call sites.
    """
    import re

    import tools.terminal_tool as tt

    module_source = inspect.getsource(tt)
    quoted_names = re.findall(r'["\'](TERMINAL_[A-Z0-9_]+)["\']', module_source)
    return set(quoted_names)
|
|
|
|
|
|
def test_cli_and_gateway_env_maps_agree():
    """The CLI and gateway bridges must cover the same terminal keys.

    Both maps feed the same downstream consumer (terminal_tool). When they
    drift apart, a config.yaml setting "works in CLI mode but not gateway
    mode" (or vice-versa) — the bug class that shipped twice already.
    """
    cli_side = _cli_env_map_keys() - _CLI_ONLY_OK

    # cli.py accepts both the legacy `env_type` alias and `backend` as source
    # keys for TERMINAL_ENV, while the gateway only accepts `backend`. Since
    # cli.py copies `backend` → `env_type` before the lookup, the two are
    # equivalent; dropping `backend` on the gateway side avoids a spurious
    # "backend missing from cli" failure.
    gateway_side = _gateway_env_map_keys() - {"backend"}

    cli_only = cli_side.difference(gateway_side)
    gateway_only = gateway_side.difference(cli_side)

    assert not cli_only, (
        "Keys in cli.py env_mappings but missing from gateway/run.py "
        f"_terminal_env_map: {sorted(cli_only)}. Add them to "
        "both maps (same bug class as docker_run_as_host_user shipping "
        "wired in cli but not gateway in April 2026)."
    )
    assert not gateway_only, (
        "Keys in gateway/run.py _terminal_env_map but missing from cli.py "
        f"env_mappings: {sorted(gateway_only)}. Add them to both maps."
    )
|
|
|
|
|
|
def test_save_config_set_supports_critical_bridged_keys():
    """``hermes config set terminal.X true`` must reach .env for critical keys.

    This was once an all-keys invariant, but several pre-existing terminal
    keys (ssh_*, docker_forward_env, docker_volumes) are not in
    _config_to_env_sync; they are handled through the separate api_keys
    TERMINAL_SSH_* fallback path or by users editing the YAML directly.

    Until those gaps are audited and fixed, only the keys that are
    load-bearing for the docker backend's ownership flag are pinned here, so
    the bug that was just fixed cannot silently regress.
    """
    load_bearing = (
        "docker_run_as_host_user",
        "docker_mount_cwd_to_workspace",
        "backend",
        "docker_image",
        "container_cpu",
        "container_memory",
        "container_disk",
        "container_persistent",
    )
    synced = _save_config_env_sync_keys()
    unsynced = set(load_bearing).difference(synced)
    assert not unsynced, (
        "`hermes config set terminal.X` doesn't sync these load-bearing "
        f"keys to .env: {sorted(unsynced)}. Add them to _config_to_env_sync "
        "in hermes_cli/config.py:set_config_value."
    )
|
|
|
|
|
|
def test_docker_run_as_host_user_is_bridged_everywhere():
    """Pin every bridge site for ``docker_run_as_host_user``.

    The key was added to terminal_tool._get_env_config and DockerEnvironment
    but NOT to cli.py's env_mappings or gateway/run.py's _terminal_env_map,
    so ``terminal.docker_run_as_host_user: true`` in config.yaml had no
    effect at runtime. This guard keeps that regression from being
    reintroduced silently.
    """
    config_key = "docker_run_as_host_user"
    assert config_key in _cli_env_map_keys()
    assert config_key in _gateway_env_map_keys()
    assert config_key in _save_config_env_sync_keys()
    assert "TERMINAL_DOCKER_RUN_AS_HOST_USER" in _terminal_tool_env_var_names()
|
|
|
|
|
|
def test_docker_mount_cwd_to_workspace_is_bridged_everywhere():
    """Pin every bridge site for ``docker_mount_cwd_to_workspace``.

    Same regression class as docker_run_as_host_user: this key was missing
    from gateway/run.py's _terminal_env_map until the
    docker_run_as_host_user audit caught it.
    """
    config_key = "docker_mount_cwd_to_workspace"
    assert config_key in _cli_env_map_keys()
    assert config_key in _gateway_env_map_keys()
    assert config_key in _save_config_env_sync_keys()
    assert "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE" in _terminal_tool_env_var_names()
|
|
|
|
|
|
def test_docker_env_is_bridged_everywhere():
    """Pin every bridge site for ``docker_env``.

    ``terminal.docker_env`` in config.yaml specifies extra env vars to inject
    into the Docker container at runtime. The key was present in
    _create_environment's container_config consumer (line ~1130) but was
    never bridged from config.yaml to TERMINAL_DOCKER_ENV, so the dict was
    always empty regardless of what the user set. All four bridging points
    are guarded so this cannot regress.
    """
    config_key = "docker_env"
    assert config_key in _cli_env_map_keys()
    assert config_key in _gateway_env_map_keys()
    assert config_key in _save_config_env_sync_keys()
    assert "TERMINAL_DOCKER_ENV" in _terminal_tool_env_var_names()
|
|
|
|
|
|
def test_docker_persist_across_processes_is_bridged_everywhere():
    """Pin every bridge site for the cross-process container reuse toggle.

    ``terminal.docker_persist_across_processes`` (issue #20561) controls
    whether ``DockerEnvironment.__init__`` probes for and reuses an existing
    labeled container at startup, and whether ``cleanup()`` removes the
    container on Hermes exit or just stops it (keeping it for the next
    process). It follows the same four-bridge invariant as
    docker_run_as_host_user / docker_env / docker_mount_cwd_to_workspace:
    drift between any of the four sites means
    ``terminal.docker_persist_across_processes: false`` in config.yaml
    silently does nothing for that entry point, leaving the user unable to
    opt out of the documented "ONE long-lived container shared across
    sessions" behavior.
    """
    config_key = "docker_persist_across_processes"
    assert config_key in _cli_env_map_keys()
    assert config_key in _gateway_env_map_keys()
    assert config_key in _save_config_env_sync_keys()
    assert "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES" in _terminal_tool_env_var_names()
|
|
|
|
|
|
def test_docker_orphan_reaper_is_bridged_everywhere():
    """Pin every bridge site for the startup orphan reaper toggle (issue #20561).

    ``terminal.docker_orphan_reaper`` controls whether Hermes sweeps stale
    Exited containers from prior SIGKILL'd processes at startup. It follows
    the same four-site bridge invariant: drift means
    ``terminal.docker_orphan_reaper: false`` silently does nothing for one
    entry point, and the reaper either runs when the operator disabled it or
    fails to run when they enabled it.
    """
    config_key = "docker_orphan_reaper"
    assert config_key in _cli_env_map_keys()
    assert config_key in _gateway_env_map_keys()
    assert config_key in _save_config_env_sync_keys()
    assert "TERMINAL_DOCKER_ORPHAN_REAPER" in _terminal_tool_env_var_names()
|