fix(whatsapp): validate bridge PID identity before killing stale pidfile entry

`_kill_stale_bridge_by_pidfile` SIGTERMed the PID recorded in `bridge.pid`
after only a bare liveness check. Once the bridge exits and is reaped the
kernel recycles that PID onto an unrelated process; because the WhatsApp bridge
crash-loops ("Bridge process died (exit code 1)" repeating), this cleanup ran
on every restart and could SIGTERM a recycled PID that had landed on the user's
browser — closing Firefox at irregular intervals with no crash and no coredump
(a clean kill of a stranger).

Same PID-recycling class as the MCP reaper (7bd1f8a2d) and the process-registry
host-PID guard (e6a99cef2); this was the third, and most actively-fired, path.

Fix: `_write_bridge_pidfile` now also records the leader's kernel start time
(line 2). `_kill_stale_bridge_by_pidfile` re-validates identity via
`_bridge_pid_is_ours` before signalling — the (pid, start time) pair must match,
or for legacy single-line pidfiles the live cmdline must name `node` + this
session's unique path. A recycled PID (different start time / cmdline) is logged
and skipped, never signalled. Legacy pidfiles stay readable.

Adds TestWhatsappBridgePidfile: real-process tests proving a genuine bridge is
reaped while a recycled PID (start-time mismatch, or non-bridge cmdline) is
spared. 7 new + 108 gateway/registry tests green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
valentt 2026-06-11 01:29:33 +02:00 committed by Teknium
parent e447723149
commit 77fdbbfe81
2 changed files with 181 additions and 8 deletions

View file

@ -90,33 +90,80 @@ def _kill_port_process(port: int) -> None:
pass
def _bridge_pid_is_ours(pid: int, session_path: Path, expected_start) -> bool:
"""True only if ``pid`` is alive AND still our node bridge for this session.
The PID is read from a file written by a previous run. Once that process
exits and is reaped the kernel can recycle the number onto an unrelated
process observed in the wild landing on a desktop browser's main process,
which a bare-liveness ``os.kill`` then SIGTERMed, closing the whole browser
at irregular intervals (every time the flapping bridge restarted).
Identity is confirmed two ways: the kernel start time captured when we wrote
the pidfile (definitive), and for legacy pidfiles with no baseline the
command line, which must contain ``node`` and this session's unique path.
A recycled PID (different start time / different cmdline) is never ours.
"""
from gateway.status import _pid_exists
if not _pid_exists(pid):
return False
if expected_start is not None:
from gateway.status import get_process_start_time
# A matching (pid, start time) pair uniquely identifies the process.
return get_process_start_time(pid) == expected_start
# Legacy pidfile (no recorded start time): fall back to a command-line
# signature so a recycled PID is still never signalled. If we cannot read
# the cmdline we refuse to kill rather than risk a stranger.
from gateway.status import _read_process_cmdline
cmdline = _read_process_cmdline(pid)
if not cmdline:
return False
return ("node" in cmdline) and (str(session_path) in cmdline)
def _kill_stale_bridge_by_pidfile(session_path: Path) -> None:
"""Kill a bridge process recorded in a PID file from a previous run.
The bridge writes ``bridge.pid`` into the session directory when it
starts. If the gateway crashed without a clean shutdown the old bridge
process becomes orphaned this helper finds and kills it.
Critically, the recorded PID is re-validated against the live process
(:func:`_bridge_pid_is_ours`) before any signal, so a recycled PID that now
names an unrelated process (e.g. the user's browser) is never killed.
"""
pid_file = session_path / "bridge.pid"
if not pid_file.exists():
return
pid = None
recorded_start = None
try:
pid = int(pid_file.read_text().strip())
except (ValueError, OSError, TypeError):
# Format: line 1 = pid, optional line 2 = kernel start time. Legacy
# files written before the guard existed have only the pid.
lines = pid_file.read_text().split("\n")
pid = int(lines[0].strip())
if len(lines) > 1 and lines[1].strip():
recorded_start = int(lines[1].strip())
except (ValueError, OSError, TypeError, IndexError):
try:
pid_file.unlink()
except OSError:
pass
return
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use the
# cross-platform existence check before sending a real signal.
from gateway.status import _pid_exists
if _pid_exists(pid):
if _bridge_pid_is_ours(pid, session_path, recorded_start):
try:
os.kill(pid, signal.SIGTERM)
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
except (ProcessLookupError, PermissionError, OSError):
pass
else:
from gateway.status import _pid_exists
if _pid_exists(pid):
logger.warning(
"[whatsapp] Not killing pidfile PID %d: it is no longer the "
"bridge (recycled onto an unrelated process); skipping to avoid "
"killing a stranger.", pid,
)
try:
pid_file.unlink()
except OSError:
@ -124,9 +171,17 @@ def _kill_stale_bridge_by_pidfile(session_path: Path) -> None:
def _write_bridge_pidfile(session_path: Path, pid: int) -> None:
"""Write the bridge PID to a file for later cleanup."""
"""Write the bridge PID (and its kernel start time) for later cleanup.
The start time on line 2 lets a future run prove the PID still names this
exact process before signalling it, so a recycled PID can never be killed
as a "stale bridge". Older single-line files remain readable.
"""
try:
(session_path / "bridge.pid").write_text(str(pid))
from gateway.status import get_process_start_time
start = get_process_start_time(pid)
text = str(pid) if start is None else "{}\n{}".format(pid, start)
(session_path / "bridge.pid").write_text(text)
except OSError:
pass

View file

@ -0,0 +1,118 @@
"""Regression tests: the WhatsApp stale-bridge cleanup must never kill a stranger.
The bridge records its PID in ``bridge.pid``. On the next start the gateway
SIGTERMs that PID to reap an orphaned bridge. The original code checked only
that the PID was *alive* but once the bridge exits and is reaped the kernel
can recycle its number onto an unrelated process. Because the WhatsApp bridge
crash-loops, this cleanup ran constantly, and a recycled PID that had landed on
the user's browser main process got SIGTERMed, closing the browser at irregular
intervals (no crash, no coredump a clean kill of a stranger).
These tests prove the identity guard: a PID is only signalled when it is still
our bridge (kernel start time matches, or for legacy pidfiles its command
line names node + this session). A recycled PID is left alone.
"""
import subprocess
import sys
import time
import pytest
from gateway.platforms.whatsapp import (
_bridge_pid_is_ours,
_kill_stale_bridge_by_pidfile,
_write_bridge_pidfile,
)
from gateway.status import get_process_start_time
def _spawn_sleeper(*extra_argv) -> subprocess.Popen:
"""Spawn a real, short-lived process; optional extra argv shapes its cmdline."""
return subprocess.Popen(
[sys.executable, "-c", "import time; time.sleep(30)", *extra_argv]
)
def _wait_dead(proc: subprocess.Popen, timeout: float = 5.0) -> bool:
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
if proc.poll() is not None:
return True
time.sleep(0.05)
return False
class TestWriteAndRoundTrip:
def test_pidfile_records_pid_and_start_time(self, tmp_path):
proc = _spawn_sleeper()
try:
_write_bridge_pidfile(tmp_path, proc.pid)
lines = (tmp_path / "bridge.pid").read_text().split("\n")
assert int(lines[0]) == proc.pid
# Line 2 is the kernel start time (present on Linux).
assert int(lines[1]) == get_process_start_time(proc.pid)
finally:
proc.kill()
proc.wait()
class TestIdentityGuard:
def test_kills_when_start_time_matches(self, tmp_path):
"""A genuine bridge (recorded start time matches) IS reaped."""
proc = _spawn_sleeper()
try:
_write_bridge_pidfile(tmp_path, proc.pid)
_kill_stale_bridge_by_pidfile(tmp_path)
assert _wait_dead(proc), "the real bridge process should be killed"
assert not (tmp_path / "bridge.pid").exists()
finally:
if proc.poll() is None:
proc.kill()
proc.wait()
def test_spares_recycled_pid_start_time_mismatch(self, tmp_path):
"""Alive PID whose start time changed (recycled) is NOT signalled."""
proc = _spawn_sleeper()
try:
real_start = get_process_start_time(proc.pid)
# Pidfile claims a different start time -> simulates a recycled PID.
(tmp_path / "bridge.pid").write_text("{}\n{}".format(proc.pid, real_start + 1))
_kill_stale_bridge_by_pidfile(tmp_path)
assert not _wait_dead(proc, timeout=1.0), "recycled PID must survive"
assert proc.poll() is None
finally:
proc.kill()
proc.wait()
def test_legacy_pidfile_spares_non_bridge_cmdline(self, tmp_path):
"""Legacy pidfile (pid only): a PID that isn't node+session is spared."""
proc = _spawn_sleeper() # cmdline is just python -c ... — not a bridge
try:
(tmp_path / "bridge.pid").write_text(str(proc.pid)) # legacy: pid only
_kill_stale_bridge_by_pidfile(tmp_path)
assert not _wait_dead(proc, timeout=1.0), "stranger must survive"
assert proc.poll() is None
finally:
proc.kill()
proc.wait()
def test_legacy_pidfile_kills_matching_bridge_cmdline(self, tmp_path):
"""Legacy pidfile: a PID whose cmdline names node + session IS reaped."""
# Shape the cmdline to look like the node bridge for this session.
proc = _spawn_sleeper("node", str(tmp_path))
try:
(tmp_path / "bridge.pid").write_text(str(proc.pid)) # legacy: pid only
_kill_stale_bridge_by_pidfile(tmp_path)
assert _wait_dead(proc), "a cmdline-confirmed bridge should be killed"
finally:
if proc.poll() is None:
proc.kill()
proc.wait()
def test_is_ours_false_for_dead_pid(self, tmp_path):
assert _bridge_pid_is_ours(999999999, tmp_path, None) is False
def test_missing_pidfile_is_noop(self, tmp_path):
# No file -> must not raise.
_kill_stale_bridge_by_pidfile(tmp_path)