diff --git a/tests/tools/test_file_sync_back.py b/tests/tools/test_file_sync_back.py index 792d4c0f51..5da0886a6c 100644 --- a/tests/tools/test_file_sync_back.py +++ b/tests/tools/test_file_sync_back.py @@ -216,7 +216,7 @@ class TestSyncBackConflict: class TestSyncBackRetries: """Retry behaviour with exponential backoff.""" - @patch("tools.environments.file_sync.time.sleep") + @patch("tools.environments.file_sync._sleep") def test_sync_back_retries_on_failure(self, mock_sleep, tmp_path): call_count = 0 @@ -237,7 +237,7 @@ class TestSyncBackRetries: mock_sleep.assert_any_call(_SYNC_BACK_BACKOFF[0]) mock_sleep.assert_any_call(_SYNC_BACK_BACKOFF[1]) - @patch("tools.environments.file_sync.time.sleep") + @patch("tools.environments.file_sync._sleep") def test_sync_back_all_retries_exhausted(self, mock_sleep, tmp_path, caplog): def always_fail(dest: Path): raise RuntimeError("persistent failure") diff --git a/tools/environments/file_sync.py b/tools/environments/file_sync.py index 0a54cbb85d..742e024ad8 100644 --- a/tools/environments/file_sync.py +++ b/tools/environments/file_sync.py @@ -29,6 +29,12 @@ from tools.environments.base import _file_mtime_key logger = logging.getLogger(__name__) +# Keep retry sleeps patchable without mutating the shared stdlib ``time`` +# module. Patching ``tools.environments.file_sync.time.sleep`` replaces +# ``time.sleep`` globally because ``time`` is the module object; under xdist +# that lets unrelated background threads inflate retry-test call counts. +_sleep = time.sleep + _SYNC_INTERVAL_SECONDS = 5.0 _FORCE_SYNC_ENV = "HERMES_FORCE_FILE_SYNC" @@ -243,7 +249,7 @@ class FileSyncManager: "sync_back: attempt %d failed (%s), retrying in %ds", attempt + 1, exc, delay, ) - time.sleep(delay) + _sleep(delay) logger.warning("sync_back: all %d attempts failed: %s", _SYNC_BACK_MAX_RETRIES, last_exc) diff --git a/tools/environments/local.py b/tools/environments/local.py index d419c72c30..3200e63e60 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -383,6 +383,36 @@ class LocalEnvironment(BaseEnvironment): def _kill_process(self, proc): """Kill the entire process group (all children).""" + + def _group_alive(pgid: int) -> bool: + try: + # POSIX-only: _IS_WINDOWS is handled before this helper is used. + os.killpg(pgid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + # The group exists, even if this process cannot signal it. + return True + + def _wait_for_group_exit(pgid: int, timeout: float) -> bool: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + # Reap the wrapper promptly. A dead but unreaped group leader + # still makes killpg(pgid, 0) report the group as alive. + try: + proc.poll() + except Exception: + pass + if not _group_alive(pgid): + return True + time.sleep(0.05) + try: + proc.poll() + except Exception: + pass + return not _group_alive(pgid) + try: if _IS_WINDOWS: proc.terminate() @@ -393,37 +423,29 @@ class LocalEnvironment(BaseEnvironment): pgid = getattr(proc, "_hermes_pgid", None) if pgid is None: raise - os.killpg(pgid, signal.SIGTERM) - deadline = time.monotonic() + 1.0 - while time.monotonic() < deadline: - if proc.poll() is not None: - try: - os.killpg(pgid, 0) - except ProcessLookupError: - return - time.sleep(0.05) - # The shell can exit quickly while a child in the same process - # group is still shutting down. Escalate based on the process - # group, not just the shell wrapper, so interrupted commands do - # not leave orphaned grandchildren under load. try: - # _IS_WINDOWS is guarded by the enclosing else branch. + os.killpg(pgid, signal.SIGTERM) + except ProcessLookupError: + return + + # Wait on the process group, not just the shell wrapper. Under + # load the wrapper can exit before grandchildren do; returning + # at that point leaves orphaned process-group members behind. + if _wait_for_group_exit(pgid, 1.0): + return + + try: + # POSIX-only: _IS_WINDOWS is handled by the outer branch. os.killpg(pgid, signal.SIGKILL) except ProcessLookupError: return + _wait_for_group_exit(pgid, 2.0) try: - proc.wait(timeout=1.0) - except subprocess.TimeoutExpired: + proc.wait(timeout=0.2) + except (subprocess.TimeoutExpired, OSError): pass - deadline = time.monotonic() + 1.0 - while time.monotonic() < deadline: - try: - os.killpg(pgid, 0) - except ProcessLookupError: - return - time.sleep(0.05) - except (ProcessLookupError, PermissionError): + except (ProcessLookupError, PermissionError, OSError): try: proc.kill() except Exception: