fix(process_registry): kill orphaned Popen on post-spawn setup failure

After Popen succeeds with os.setsid (detached process group), 5 things
happen with no try/except: Thread construction, reader.start(), lock
acquisition, prune+register, checkpoint write. If any raises, the
Popen object goes unregistered and the detached process group leaks
indefinitely.

Wrap the post-spawn setup in try/except. On failure:
  - os.killpg(getpgid(pid), SIGKILL) takes down the entire process
    group (not just the shell - important because of detached PG +
    -lic shell wrapper that may have spawned children)
  - proc.kill() fallback for ProcessLookupError/PermissionError/OSError
  - proc.wait(timeout=5) reaps with a bound
  - re-raise to preserve original traceback
Nested try/except around cleanup so a secondary failure can't mask the
original.

Closes #2749.
This commit is contained in:
Wesley Simplicio 2026-05-09 14:54:25 -07:00 committed by Teknium
parent c179bdab3c
commit 53ec32819c
2 changed files with 124 additions and 13 deletions

View file

@ -530,6 +530,96 @@ class TestSpawnEnvSanitization:
assert env.commands[2][0] == "cat '/path with spaces/hermes_bg.exit' 2>/dev/null"
# =========================================================================
# Popen leak prevention
# =========================================================================
class TestPopenLeakOnSetupFailure:
"""Regression for issue #2749: subprocess orphaned when post-Popen setup raises."""
def test_popen_killed_when_thread_creation_fails(self, registry):
"""If Thread() raises after Popen, proc must be killed — not orphaned."""
killed = []
proc = MagicMock()
proc.pid = 9999
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
def fake_kill():
killed.append(True)
proc.kill = fake_kill
proc.wait = MagicMock()
def boom(*args, **kwargs):
raise RuntimeError("Thread creation failed")
with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", return_value=proc), \
patch("threading.Thread", side_effect=boom), \
patch.object(registry, "_write_checkpoint"):
with pytest.raises(RuntimeError, match="Thread creation failed"):
registry.spawn_local("echo hello", cwd="/tmp")
assert killed, "proc.kill() must be called when post-Popen setup raises"
def test_popen_killed_when_write_checkpoint_fails(self, registry):
"""If _write_checkpoint raises after Popen, proc must still be killed."""
killed = []
proc = MagicMock()
proc.pid = 8888
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
def fake_kill():
killed.append(True)
proc.kill = fake_kill
proc.wait = MagicMock()
fake_thread = MagicMock()
with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", return_value=proc), \
patch("threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint", side_effect=OSError("disk full")):
with pytest.raises(OSError, match="disk full"):
registry.spawn_local("echo hello", cwd="/tmp")
assert killed, "proc.kill() must be called when _write_checkpoint raises"
def test_popen_not_killed_on_success(self, registry):
"""Successful spawn must NOT kill the process."""
killed = []
proc = MagicMock()
proc.pid = 7777
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
def fake_kill():
killed.append(True)
proc.kill = fake_kill
proc.wait = MagicMock()
fake_thread = MagicMock()
with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", return_value=proc), \
patch("threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint"):
session = registry.spawn_local("echo hello", cwd="/tmp")
assert not killed, "proc.kill() must NOT be called on successful spawn"
assert session.pid == 7777
# =========================================================================
# Checkpoint
# =========================================================================