From 77fdbbfe81d87fb04feee4339bea2f830be80b94 Mon Sep 17 00:00:00 2001 From: valentt Date: Thu, 11 Jun 2026 01:29:33 +0200 Subject: [PATCH] fix(whatsapp): validate bridge PID identity before killing stale pidfile entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_kill_stale_bridge_by_pidfile` SIGTERMed the PID recorded in `bridge.pid` after only a bare liveness check. Once the bridge exits and is reaped the kernel recycles that PID onto an unrelated process; because the WhatsApp bridge crash-loops ("Bridge process died (exit code 1)" repeating), this cleanup ran on every restart and could SIGTERM a recycled PID that had landed on the user's browser — closing Firefox at irregular intervals with no crash and no coredump (a clean kill of a stranger). Same PID-recycling class as the MCP reaper (7bd1f8a2d) and the process-registry host-PID guard (e6a99cef2); this was the third, and most actively-fired, path. Fix: `_write_bridge_pidfile` now also records the leader's kernel start time (line 2). `_kill_stale_bridge_by_pidfile` re-validates identity via `_bridge_pid_is_ours` before signalling — the (pid, start time) pair must match, or for legacy single-line pidfiles the live cmdline must name `node` + this session's unique path. A recycled PID (different start time / cmdline) is logged and skipped, never signalled. Legacy pidfiles stay readable. Adds TestWhatsappBridgePidfile: real-process tests proving a genuine bridge is reaped while a recycled PID (start-time mismatch, or non-bridge cmdline) is spared. 7 new + 108 gateway/registry tests green. Co-Authored-By: Claude Opus 4.8 (1M context) --- plugins/platforms/whatsapp/adapter.py | 71 +++++++++-- tests/gateway/test_whatsapp_bridge_pidfile.py | 118 ++++++++++++++++++ 2 files changed, 181 insertions(+), 8 deletions(-) create mode 100644 tests/gateway/test_whatsapp_bridge_pidfile.py diff --git a/plugins/platforms/whatsapp/adapter.py b/plugins/platforms/whatsapp/adapter.py index 239b386ca3d..4526e31278c 100644 --- a/plugins/platforms/whatsapp/adapter.py +++ b/plugins/platforms/whatsapp/adapter.py @@ -90,33 +90,80 @@ def _kill_port_process(port: int) -> None: pass +def _bridge_pid_is_ours(pid: int, session_path: Path, expected_start) -> bool: + """True only if ``pid`` is alive AND still our node bridge for this session. + + The PID is read from a file written by a previous run. Once that process + exits and is reaped the kernel can recycle the number onto an unrelated + process — observed in the wild landing on a desktop browser's main process, + which a bare-liveness ``os.kill`` then SIGTERMed, closing the whole browser + at irregular intervals (every time the flapping bridge restarted). + + Identity is confirmed two ways: the kernel start time captured when we wrote + the pidfile (definitive), and — for legacy pidfiles with no baseline — the + command line, which must contain ``node`` and this session's unique path. + A recycled PID (different start time / different cmdline) is never ours. + """ + from gateway.status import _pid_exists + if not _pid_exists(pid): + return False + if expected_start is not None: + from gateway.status import get_process_start_time + # A matching (pid, start time) pair uniquely identifies the process. + return get_process_start_time(pid) == expected_start + # Legacy pidfile (no recorded start time): fall back to a command-line + # signature so a recycled PID is still never signalled. If we cannot read + # the cmdline we refuse to kill rather than risk a stranger. + from gateway.status import _read_process_cmdline + cmdline = _read_process_cmdline(pid) + if not cmdline: + return False + return ("node" in cmdline) and (str(session_path) in cmdline) + + def _kill_stale_bridge_by_pidfile(session_path: Path) -> None: """Kill a bridge process recorded in a PID file from a previous run. The bridge writes ``bridge.pid`` into the session directory when it starts. If the gateway crashed without a clean shutdown the old bridge process becomes orphaned — this helper finds and kills it. + + Critically, the recorded PID is re-validated against the live process + (:func:`_bridge_pid_is_ours`) before any signal, so a recycled PID that now + names an unrelated process (e.g. the user's browser) is never killed. """ pid_file = session_path / "bridge.pid" if not pid_file.exists(): return + pid = None + recorded_start = None try: - pid = int(pid_file.read_text().strip()) - except (ValueError, OSError, TypeError): + # Format: line 1 = pid, optional line 2 = kernel start time. Legacy + # files written before the guard existed have only the pid. + lines = pid_file.read_text().split("\n") + pid = int(lines[0].strip()) + if len(lines) > 1 and lines[1].strip(): + recorded_start = int(lines[1].strip()) + except (ValueError, OSError, TypeError, IndexError): try: pid_file.unlink() except OSError: pass return - # ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use the - # cross-platform existence check before sending a real signal. - from gateway.status import _pid_exists - if _pid_exists(pid): + if _bridge_pid_is_ours(pid, session_path, recorded_start): try: os.kill(pid, signal.SIGTERM) logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid) except (ProcessLookupError, PermissionError, OSError): pass + else: + from gateway.status import _pid_exists + if _pid_exists(pid): + logger.warning( + "[whatsapp] Not killing pidfile PID %d: it is no longer the " + "bridge (recycled onto an unrelated process); skipping to avoid " + "killing a stranger.", pid, + ) try: pid_file.unlink() except OSError: @@ -124,9 +171,17 @@ def _kill_stale_bridge_by_pidfile(session_path: Path) -> None: def _write_bridge_pidfile(session_path: Path, pid: int) -> None: - """Write the bridge PID to a file for later cleanup.""" + """Write the bridge PID (and its kernel start time) for later cleanup. + + The start time on line 2 lets a future run prove the PID still names this + exact process before signalling it, so a recycled PID can never be killed + as a "stale bridge". Older single-line files remain readable. + """ try: - (session_path / "bridge.pid").write_text(str(pid)) + from gateway.status import get_process_start_time + start = get_process_start_time(pid) + text = str(pid) if start is None else "{}\n{}".format(pid, start) + (session_path / "bridge.pid").write_text(text) except OSError: pass diff --git a/tests/gateway/test_whatsapp_bridge_pidfile.py b/tests/gateway/test_whatsapp_bridge_pidfile.py new file mode 100644 index 00000000000..0e43b621fa8 --- /dev/null +++ b/tests/gateway/test_whatsapp_bridge_pidfile.py @@ -0,0 +1,118 @@ +"""Regression tests: the WhatsApp stale-bridge cleanup must never kill a stranger. + +The bridge records its PID in ``bridge.pid``. On the next start the gateway +SIGTERMs that PID to reap an orphaned bridge. The original code checked only +that the PID was *alive* — but once the bridge exits and is reaped the kernel +can recycle its number onto an unrelated process. Because the WhatsApp bridge +crash-loops, this cleanup ran constantly, and a recycled PID that had landed on +the user's browser main process got SIGTERMed, closing the browser at irregular +intervals (no crash, no coredump — a clean kill of a stranger). + +These tests prove the identity guard: a PID is only signalled when it is still +our bridge (kernel start time matches, or — for legacy pidfiles — its command +line names node + this session). A recycled PID is left alone. +""" + +import subprocess +import sys +import time + +import pytest + +from gateway.platforms.whatsapp import ( + _bridge_pid_is_ours, + _kill_stale_bridge_by_pidfile, + _write_bridge_pidfile, +) +from gateway.status import get_process_start_time + + +def _spawn_sleeper(*extra_argv) -> subprocess.Popen: + """Spawn a real, short-lived process; optional extra argv shapes its cmdline.""" + return subprocess.Popen( + [sys.executable, "-c", "import time; time.sleep(30)", *extra_argv] + ) + + +def _wait_dead(proc: subprocess.Popen, timeout: float = 5.0) -> bool: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if proc.poll() is not None: + return True + time.sleep(0.05) + return False + + +class TestWriteAndRoundTrip: + def test_pidfile_records_pid_and_start_time(self, tmp_path): + proc = _spawn_sleeper() + try: + _write_bridge_pidfile(tmp_path, proc.pid) + lines = (tmp_path / "bridge.pid").read_text().split("\n") + assert int(lines[0]) == proc.pid + # Line 2 is the kernel start time (present on Linux). + assert int(lines[1]) == get_process_start_time(proc.pid) + finally: + proc.kill() + proc.wait() + + +class TestIdentityGuard: + def test_kills_when_start_time_matches(self, tmp_path): + """A genuine bridge (recorded start time matches) IS reaped.""" + proc = _spawn_sleeper() + try: + _write_bridge_pidfile(tmp_path, proc.pid) + _kill_stale_bridge_by_pidfile(tmp_path) + assert _wait_dead(proc), "the real bridge process should be killed" + assert not (tmp_path / "bridge.pid").exists() + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + + def test_spares_recycled_pid_start_time_mismatch(self, tmp_path): + """Alive PID whose start time changed (recycled) is NOT signalled.""" + proc = _spawn_sleeper() + try: + real_start = get_process_start_time(proc.pid) + # Pidfile claims a different start time -> simulates a recycled PID. + (tmp_path / "bridge.pid").write_text("{}\n{}".format(proc.pid, real_start + 1)) + _kill_stale_bridge_by_pidfile(tmp_path) + assert not _wait_dead(proc, timeout=1.0), "recycled PID must survive" + assert proc.poll() is None + finally: + proc.kill() + proc.wait() + + def test_legacy_pidfile_spares_non_bridge_cmdline(self, tmp_path): + """Legacy pidfile (pid only): a PID that isn't node+session is spared.""" + proc = _spawn_sleeper() # cmdline is just python -c ... — not a bridge + try: + (tmp_path / "bridge.pid").write_text(str(proc.pid)) # legacy: pid only + _kill_stale_bridge_by_pidfile(tmp_path) + assert not _wait_dead(proc, timeout=1.0), "stranger must survive" + assert proc.poll() is None + finally: + proc.kill() + proc.wait() + + def test_legacy_pidfile_kills_matching_bridge_cmdline(self, tmp_path): + """Legacy pidfile: a PID whose cmdline names node + session IS reaped.""" + # Shape the cmdline to look like the node bridge for this session. + proc = _spawn_sleeper("node", str(tmp_path)) + try: + (tmp_path / "bridge.pid").write_text(str(proc.pid)) # legacy: pid only + _kill_stale_bridge_by_pidfile(tmp_path) + assert _wait_dead(proc), "a cmdline-confirmed bridge should be killed" + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + + def test_is_ours_false_for_dead_pid(self, tmp_path): + assert _bridge_pid_is_ours(999999999, tmp_path, None) is False + + def test_missing_pidfile_is_noop(self, tmp_path): + # No file -> must not raise. + _kill_stale_bridge_by_pidfile(tmp_path)