fix: reap zombie subprocesses in web_server action status and meet_bot cleanup

- web_server.py: after proc.poll() returns a non-None exit code, call
  proc.wait() to reap the child and move the entry from _ACTION_PROCS
  to _ACTION_RESULTS. Previously .poll() alone left <defunct> zombies.
- meet_bot.py: terminate and wait on the pcm_pump subprocess (paplay/
  ffmpeg) during the finally-block teardown. Previously leaked on every
  normal bot exit.
- tests: add test_action_status_reaps_completed_process and
  test_action_status_ignores_wait_failure covering both the happy path
  and the wait()-raises-OSError edge case.

Closes #38032
This commit is contained in:
islam666 2026-06-03 08:37:09 +00:00 committed by Teknium
parent e53b74c394
commit 78e2101cd2
3 changed files with 77 additions and 1 deletions

View file

@ -1691,6 +1691,13 @@ async def get_action_status(name: str, lines: int = 200):
exit_code = proc.poll()
running = exit_code is None
pid = proc.pid
if exit_code is not None:
try:
proc.wait(timeout=1)
except Exception:
pass
_ACTION_RESULTS[name] = {"exit_code": exit_code, "pid": pid}
_ACTION_PROCS.pop(name, None)
return {
"name": name,

View file

@ -699,7 +699,13 @@ def run_bot() -> int: # noqa: C901 — orchestration, explicit branches
context.close()
browser.close()
# v2: teardown realtime speaker + audio bridge.
# v2: teardown PCM pump, speaker thread, and audio bridge.
if rt.get("pcm_pump"):
try:
rt["pcm_pump"].terminate()
rt["pcm_pump"].wait(timeout=3)
except Exception:
pass
if rt["speaker_stop"]:
try:
rt["speaker_stop"]()

View file

@ -823,6 +823,69 @@ class TestWebServerEndpoints:
assert resp.json() == {"ok": True, "pid": 12345, "name": "hermes-update"}
assert calls == [(["update"], "hermes-update")]
def test_action_status_reaps_completed_process(self, monkeypatch):
    """A finished action process is waited on and moved to the results map.

    The fake process reports exit code 0 from ``poll()``. The status
    endpoint is expected to call ``wait()`` on it, drop it from
    ``_ACTION_PROCS`` and record ``{"exit_code", "pid"}`` under the same
    name in ``_ACTION_RESULTS``.
    """
    import hermes_cli.web_server as web_server

    action = "hermes-update"
    wait_calls = []

    class _Proc:
        pid = 42424

        def poll(self):
            return 0

        def wait(self, timeout=None):
            wait_calls.append(timeout)

    # Start from a clean slate for this action name.
    web_server._ACTION_PROCS.pop(action, None)
    web_server._ACTION_RESULTS.pop(action, None)
    # setitem is undone at fixture teardown, so the fake process cannot
    # leak into other tests even if the endpoint fails to remove it.
    monkeypatch.setitem(web_server._ACTION_PROCS, action, _Proc())

    try:
        resp = self.client.get("/api/actions/hermes-update/status")
        assert resp.status_code == 200

        data = resp.json()
        assert data["running"] is False
        assert data["exit_code"] == 0
        assert data["pid"] == 42424

        # Process should have been reaped and moved to results.
        assert wait_calls, "wait() was never called on the finished process"
        assert action not in web_server._ACTION_PROCS
        assert web_server._ACTION_RESULTS[action] == {
            "exit_code": 0,
            "pid": 42424,
        }
    finally:
        # The endpoint writes this entry itself; remove it so later tests
        # do not observe a stale result for the same action name.
        web_server._ACTION_RESULTS.pop(action, None)
def test_action_status_ignores_wait_failure(self, monkeypatch):
    """A ``wait()`` that raises must not break the status endpoint.

    The fake process reports exit code 1 from ``poll()`` and raises
    ``OSError`` from ``wait()``. The endpoint is still expected to answer
    200, drop the process from ``_ACTION_PROCS`` and record the exit code
    and pid in ``_ACTION_RESULTS``.
    """
    import hermes_cli.web_server as web_server

    action = "hermes-update"

    class _Proc:
        pid = 99

        def poll(self):
            return 1

        def wait(self, timeout=None):
            raise OSError("already reaped")

    # Start from a clean slate for this action name.
    web_server._ACTION_PROCS.pop(action, None)
    web_server._ACTION_RESULTS.pop(action, None)
    # setitem is undone at fixture teardown, so the fake process cannot
    # leak into other tests even if the endpoint fails to remove it.
    monkeypatch.setitem(web_server._ACTION_PROCS, action, _Proc())

    try:
        resp = self.client.get("/api/actions/hermes-update/status")
        assert resp.status_code == 200

        data = resp.json()
        assert data["exit_code"] == 1

        # Still reaped despite wait() raising.
        assert action not in web_server._ACTION_PROCS
        assert web_server._ACTION_RESULTS[action] == {
            "exit_code": 1,
            "pid": 99,
        }
    finally:
        # The endpoint writes this entry itself; remove it so later tests
        # do not observe a stale result for the same action name.
        web_server._ACTION_RESULTS.pop(action, None)
def test_get_status_filters_unconfigured_gateway_platforms(self, monkeypatch):
import gateway.config as gateway_config
import hermes_cli.web_server as web_server