mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
test(update): teach restart-mocks about the post-update survivor sweep
Issue #17648 added a post-update SIGTERM-survivor sweep to `cmd_update`: ~3s after issuing graceful/SIGTERM restarts, the code re-queries `find_gateway_pids` and SIGKILLs anything still alive. That's the right fix for stuck-drain gateways in production, but it broke three unit tests that assumed `find_gateway_pids` would keep returning the same PIDs forever: FAILED ::TestCmdUpdateLaunchdRestart::test_update_restarts_profile_manual_gateways AssertionError: Expected 'kill' to not have been called. Called 1 times. Calls: [call(12345, <Signals.SIGKILL: 9>)]. FAILED ::TestCmdUpdateLaunchdRestart::test_update_profile_manual_gateway_falls_back_to_sigterm AssertionError: Expected 'kill' to have been called once. Called 2 times. Calls: [call(12345, SIGTERM), call(12345, SIGKILL)]. FAILED ::TestServicePidExclusion::test_update_kills_manual_pid_but_not_service_pid assert 2 == 1 manual_kills = [call(42999, SIGTERM), call(42999, SIGKILL)] In each test `os.kill` is mocked, so the simulated PID never actually exits \u2014 the sweep finds it again and escalates. The production code is correct; the tests just need to model OS behaviour properly. Two-test fix (profile-manual restart cases): use `side_effect=[[12345], []]` so the first `find_gateway_pids` call returns the live PID and the second (the sweep) returns nothing, as if the OS had reaped the process. Service-PID-exclusion fix: track which PIDs got killed in a closure set, and exclude them on subsequent `fake_find` calls. `os.kill` gets a `side_effect` that records the kill instead of swallowing it silently. Now the sweep doesn't re-find the manual PID, no SIGKILL escalation, `manual_kills == 1`. Validation: $ pytest tests/hermes_cli/test_update_gateway_restart.py -q 43 passed in 4.13s No production code change. Fixes the three failures observed on `main` (run 25250051126): test_update_restarts_profile_manual_gateways test_update_profile_manual_gateway_falls_back_to_sigterm test_update_kills_manual_pid_but_not_service_pid Refs: #17648 (post-update survivor sweep that the tests didn't model).
This commit is contained in:
parent
aa5690342b
commit
1f27ca638f
1 changed files with 24 additions and 4 deletions
|
|
@ -415,7 +415,13 @@ class TestCmdUpdateLaunchdRestart:
|
|||
pid=12345,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
|
||||
# ``find_gateway_pids`` is invoked twice: once to enumerate manual
|
||||
# PIDs to restart, then again ~3s later by the post-restart survivor
|
||||
# sweep (#17648). Return the live PID first, then an empty list to
|
||||
# simulate the process actually exiting after the graceful restart
|
||||
# — otherwise the sweep would SIGKILL pid 12345 even though graceful
|
||||
# drain succeeded, and ``kill.assert_not_called()`` would fire.
|
||||
with patch.object(gateway_cli, "find_gateway_pids", side_effect=[[12345], []]), \
|
||||
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
|
||||
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
|
||||
patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=True) as graceful, \
|
||||
|
|
@ -453,7 +459,11 @@ class TestCmdUpdateLaunchdRestart:
|
|||
pid=12345,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
|
||||
# See note in ``test_update_restarts_profile_manual_gateways``: the
|
||||
# post-restart survivor sweep (#17648) re-queries ``find_gateway_pids``
|
||||
# ~3s after the restart attempt. Return ``[]`` on the second call so
|
||||
# the SIGTERM fallback isn't escalated to SIGKILL by the sweep.
|
||||
with patch.object(gateway_cli, "find_gateway_pids", side_effect=[[12345], []]), \
|
||||
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
|
||||
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
|
||||
patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=False) as graceful, \
|
||||
|
|
@ -872,15 +882,25 @@ class TestServicePidExclusion:
|
|||
launchctl_loaded=True,
|
||||
)
|
||||
|
||||
# Survivor sweep (#17648) re-queries ``find_gateway_pids`` after
|
||||
# SIGTERM. ``os.kill`` is mocked, so the PID never "dies" — track
|
||||
# the killed-via-SIGTERM PIDs ourselves and exclude them on later
|
||||
# calls to simulate the OS reaping the process. Without this the
|
||||
# sweep escalates with SIGKILL and ``manual_kills == 2`` instead of 1.
|
||||
_killed_pids: set[int] = set()
|
||||
|
||||
def fake_find(exclude_pids=None, all_profiles=False):
|
||||
_exclude = exclude_pids or set()
|
||||
_exclude = (exclude_pids or set()) | _killed_pids
|
||||
return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
|
||||
|
||||
def fake_kill(pid, _sig):
|
||||
_killed_pids.add(pid)
|
||||
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids", side_effect=fake_find,
|
||||
), patch("os.kill") as mock_kill:
|
||||
), patch("os.kill", side_effect=fake_kill) as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue