From 1f27ca638fd7d9ee5e8725d610ded481cc7f8af6 Mon Sep 17 00:00:00 2001 From: Sanjay Santhanam <51058514+Sanjays2402@users.noreply.github.com> Date: Sat, 2 May 2026 16:49:57 -0700 Subject: [PATCH] test(update): teach restart-mocks about the post-update survivor sweep Issue #17648 added a post-update SIGTERM-survivor sweep to `cmd_update`: ~3s after issuing graceful/SIGTERM restarts, the code re-queries `find_gateway_pids` and SIGKILLs anything still alive. That's the right fix for stuck-drain gateways in production, but it broke three unit tests that assumed `find_gateway_pids` would keep returning the same PIDs forever: FAILED ::TestCmdUpdateLaunchdRestart::test_update_restarts_profile_manual_gateways AssertionError: Expected 'kill' to not have been called. Called 1 times. Calls: [call(12345, )]. FAILED ::TestCmdUpdateLaunchdRestart::test_update_profile_manual_gateway_falls_back_to_sigterm AssertionError: Expected 'kill' to have been called once. Called 2 times. Calls: [call(12345, SIGTERM), call(12345, SIGKILL)]. FAILED ::TestServicePidExclusion::test_update_kills_manual_pid_but_not_service_pid assert 2 == 1 manual_kills = [call(42999, SIGTERM), call(42999, SIGKILL)] In each test `os.kill` is mocked, so the simulated PID never actually exits \u2014 the sweep finds it again and escalates. The production code is correct; the tests just need to model OS behaviour properly. Two-test fix (profile-manual restart cases): use `side_effect=[[12345], []]` so the first `find_gateway_pids` call returns the live PID and the second (the sweep) returns nothing, as if the OS had reaped the process. Service-PID-exclusion fix: track which PIDs got killed in a closure set, and exclude them on subsequent `fake_find` calls. `os.kill` gets a `side_effect` that records the kill instead of swallowing it silently. Now the sweep doesn't re-find the manual PID, no SIGKILL escalation, `manual_kills == 1`. Validation: $ pytest tests/hermes_cli/test_update_gateway_restart.py -q 43 passed in 4.13s No production code change. Fixes the three failures observed on `main` (run 25250051126): test_update_restarts_profile_manual_gateways test_update_profile_manual_gateway_falls_back_to_sigterm test_update_kills_manual_pid_but_not_service_pid Refs: #17648 (post-update survivor sweep that the tests didn't model). --- .../hermes_cli/test_update_gateway_restart.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py index 721149ddef..aa43acd9e1 100644 --- a/tests/hermes_cli/test_update_gateway_restart.py +++ b/tests/hermes_cli/test_update_gateway_restart.py @@ -415,7 +415,13 @@ class TestCmdUpdateLaunchdRestart: pid=12345, ) - with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \ + # ``find_gateway_pids`` is invoked twice: once to enumerate manual + # PIDs to restart, then again ~3s later by the post-restart survivor + # sweep (#17648). Return the live PID first, then an empty list to + # simulate the process actually exiting after the graceful restart + # — otherwise the sweep would SIGKILL pid 12345 even though graceful + # drain succeeded, and ``kill.assert_not_called()`` would fire. + with patch.object(gateway_cli, "find_gateway_pids", side_effect=[[12345], []]), \ patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \ patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \ patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=True) as graceful, \ @@ -453,7 +459,11 @@ class TestCmdUpdateLaunchdRestart: pid=12345, ) - with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \ + # See note in ``test_update_restarts_profile_manual_gateways``: the + # post-restart survivor sweep (#17648) re-queries ``find_gateway_pids`` + # ~3s after the restart attempt. Return ``[]`` on the second call so + # the SIGTERM fallback isn't escalated to SIGKILL by the sweep. + with patch.object(gateway_cli, "find_gateway_pids", side_effect=[[12345], []]), \ patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \ patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \ patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=False) as graceful, \ @@ -872,15 +882,25 @@ class TestServicePidExclusion: launchctl_loaded=True, ) + # Survivor sweep (#17648) re-queries ``find_gateway_pids`` after + # SIGTERM. ``os.kill`` is mocked, so the PID never "dies" — track + # the killed-via-SIGTERM PIDs ourselves and exclude them on later + # calls to simulate the OS reaping the process. Without this the + # sweep escalates with SIGKILL and ``manual_kills == 2`` instead of 1. + _killed_pids: set[int] = set() + def fake_find(exclude_pids=None, all_profiles=False): - _exclude = exclude_pids or set() + _exclude = (exclude_pids or set()) | _killed_pids return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude] + def fake_kill(pid, _sig): + _killed_pids.add(pid) + with patch.object( gateway_cli, "_get_service_pids", return_value={SERVICE_PID} ), patch.object( gateway_cli, "find_gateway_pids", side_effect=fake_find, - ), patch("os.kill") as mock_kill: + ), patch("os.kill", side_effect=fake_kill) as mock_kill: cmd_update(mock_args) captured = capsys.readouterr().out