fix(ci): stabilize main test suite regressions (#17660)

* fix: stabilize main test suite regressions

* test(agent): update MiniMax normalization expectation

* test: stabilize remaining CI assertions

* test: harden config helper monkeypatching

* test: harden CI-only assertions

* fix(agent): propagate fast streaming interrupts
This commit is contained in:
Stephen Schoettler 2026-04-29 23:18:55 -07:00 committed by GitHub
parent e7beaaf184
commit f73364b1c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 450 additions and 127 deletions

View file

@ -16,6 +16,7 @@ import signal
import subprocess
import threading
import time
from types import SimpleNamespace
import pytest
@ -37,6 +38,58 @@ def _pgid_still_alive(pgid: int) -> bool:
return False
def _process_group_snapshot(pgid: int) -> str:
"""Return a process-table snapshot for diagnostics."""
return subprocess.run(
["ps", "-o", "pid,ppid,pgid,stat,cmd", "-g", str(pgid)],
capture_output=True,
text=True,
check=False,
).stdout.strip()
def _wait_for_pgid_exit(pgid: int, timeout: float = 10.0) -> bool:
"""Wait for a process group to disappear under loaded xdist hosts."""
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
if not _pgid_still_alive(pgid):
return True
time.sleep(0.1)
return not _pgid_still_alive(pgid)
def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
"""If the shell wrapper exits before cleanup, still kill its process group.
Without the cached pgid fallback, ``os.getpgid(proc.pid)`` raises for the
dead wrapper and cleanup falls back to ``proc.kill()``, which cannot reach
orphaned grandchildren still running in the original process group.
"""
env = object.__new__(LocalEnvironment)
proc = SimpleNamespace(
pid=12345,
_hermes_pgid=67890,
poll=lambda: 0,
kill=lambda: None,
)
killpg_calls = []
def fake_getpgid(_pid):
raise ProcessLookupError
def fake_killpg(pgid, sig):
killpg_calls.append((pgid, sig))
if sig == 0:
raise ProcessLookupError
monkeypatch.setattr(os, "getpgid", fake_getpgid)
monkeypatch.setattr(os, "killpg", fake_killpg)
env._kill_process(proc)
assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
"""When KeyboardInterrupt arrives mid-poll, the subprocess group must be
killed before the exception is re-raised."""
@ -118,19 +171,15 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"
# The critical assertion: the subprocess GROUP must be dead. Not
# just the bash wrapper — the 'sleep 30' child too.
# Give the SIGTERM+1s wait+SIGKILL escalation a moment to complete.
deadline = time.monotonic() + 3.0
while time.monotonic() < deadline:
if not _pgid_still_alive(pgid):
break
time.sleep(0.1)
assert not _pgid_still_alive(pgid), (
# just the bash wrapper — the 'sleep 30' child too. Under xdist load,
# process-group disappearance can lag briefly after the worker exits,
# especially if the process is already dying or waiting to be reaped.
assert _wait_for_pgid_exit(pgid), (
f"subprocess group {pgid} is STILL ALIVE after worker received "
f"KeyboardInterrupt — orphan bug regressed. This is the "
f"sleep-300-survives-SIGTERM scenario from Physikal's Apr 2026 "
f"report. See tools/environments/base.py _wait_for_process "
f"except-block."
f"except-block.\n{_process_group_snapshot(pgid)}"
)
# And the worker should have observed the KeyboardInterrupt (i.e.
# it re-raised cleanly, not silently swallowed).