mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-25 11:02:03 +00:00
fix(whatsapp): validate bridge PID identity before killing stale pidfile entry
`_kill_stale_bridge_by_pidfile` SIGTERMed the PID recorded in `bridge.pid`
after only a bare liveness check. Once the bridge exits and is reaped the
kernel recycles that PID onto an unrelated process; because the WhatsApp bridge
crash-loops ("Bridge process died (exit code 1)" repeating), this cleanup ran
on every restart and could SIGTERM a recycled PID that had landed on the user's
browser — closing Firefox at irregular intervals with no crash and no coredump
(a clean kill of a stranger).
Same PID-recycling class as the MCP reaper (7bd1f8a2d) and the process-registry
host-PID guard (e6a99cef2); this was the third, and most actively-fired, path.
Fix: `_write_bridge_pidfile` now also records the leader's kernel start time
(line 2). `_kill_stale_bridge_by_pidfile` re-validates identity via
`_bridge_pid_is_ours` before signalling — the (pid, start time) pair must match,
or for legacy single-line pidfiles the live cmdline must name `node` + this
session's unique path. A recycled PID (different start time / cmdline) is logged
and skipped, never signalled. Legacy pidfiles stay readable.
Adds TestWhatsappBridgePidfile: real-process tests proving a genuine bridge is
reaped while a recycled PID (start-time mismatch, or non-bridge cmdline) is
spared. 7 new + 108 gateway/registry tests green.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e447723149
commit
77fdbbfe81
2 changed files with 181 additions and 8 deletions
118
tests/gateway/test_whatsapp_bridge_pidfile.py
Normal file
118
tests/gateway/test_whatsapp_bridge_pidfile.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
"""Regression tests: the WhatsApp stale-bridge cleanup must never kill a stranger.
|
||||
|
||||
The bridge records its PID in ``bridge.pid``. On the next start the gateway
|
||||
SIGTERMs that PID to reap an orphaned bridge. The original code checked only
|
||||
that the PID was *alive* — but once the bridge exits and is reaped the kernel
|
||||
can recycle its number onto an unrelated process. Because the WhatsApp bridge
|
||||
crash-loops, this cleanup ran constantly, and a recycled PID that had landed on
|
||||
the user's browser main process got SIGTERMed, closing the browser at irregular
|
||||
intervals (no crash, no coredump — a clean kill of a stranger).
|
||||
|
||||
These tests prove the identity guard: a PID is only signalled when it is still
|
||||
our bridge (kernel start time matches, or — for legacy pidfiles — its command
|
||||
line names node + this session). A recycled PID is left alone.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from gateway.platforms.whatsapp import (
|
||||
_bridge_pid_is_ours,
|
||||
_kill_stale_bridge_by_pidfile,
|
||||
_write_bridge_pidfile,
|
||||
)
|
||||
from gateway.status import get_process_start_time
|
||||
|
||||
|
||||
def _spawn_sleeper(*extra_argv) -> subprocess.Popen:
|
||||
"""Spawn a real, short-lived process; optional extra argv shapes its cmdline."""
|
||||
return subprocess.Popen(
|
||||
[sys.executable, "-c", "import time; time.sleep(30)", *extra_argv]
|
||||
)
|
||||
|
||||
|
||||
def _wait_dead(proc: subprocess.Popen, timeout: float = 5.0) -> bool:
|
||||
deadline = time.monotonic() + timeout
|
||||
while time.monotonic() < deadline:
|
||||
if proc.poll() is not None:
|
||||
return True
|
||||
time.sleep(0.05)
|
||||
return False
|
||||
|
||||
|
||||
class TestWriteAndRoundTrip:
|
||||
def test_pidfile_records_pid_and_start_time(self, tmp_path):
|
||||
proc = _spawn_sleeper()
|
||||
try:
|
||||
_write_bridge_pidfile(tmp_path, proc.pid)
|
||||
lines = (tmp_path / "bridge.pid").read_text().split("\n")
|
||||
assert int(lines[0]) == proc.pid
|
||||
# Line 2 is the kernel start time (present on Linux).
|
||||
assert int(lines[1]) == get_process_start_time(proc.pid)
|
||||
finally:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
|
||||
class TestIdentityGuard:
|
||||
def test_kills_when_start_time_matches(self, tmp_path):
|
||||
"""A genuine bridge (recorded start time matches) IS reaped."""
|
||||
proc = _spawn_sleeper()
|
||||
try:
|
||||
_write_bridge_pidfile(tmp_path, proc.pid)
|
||||
_kill_stale_bridge_by_pidfile(tmp_path)
|
||||
assert _wait_dead(proc), "the real bridge process should be killed"
|
||||
assert not (tmp_path / "bridge.pid").exists()
|
||||
finally:
|
||||
if proc.poll() is None:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_spares_recycled_pid_start_time_mismatch(self, tmp_path):
|
||||
"""Alive PID whose start time changed (recycled) is NOT signalled."""
|
||||
proc = _spawn_sleeper()
|
||||
try:
|
||||
real_start = get_process_start_time(proc.pid)
|
||||
# Pidfile claims a different start time -> simulates a recycled PID.
|
||||
(tmp_path / "bridge.pid").write_text("{}\n{}".format(proc.pid, real_start + 1))
|
||||
_kill_stale_bridge_by_pidfile(tmp_path)
|
||||
assert not _wait_dead(proc, timeout=1.0), "recycled PID must survive"
|
||||
assert proc.poll() is None
|
||||
finally:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_legacy_pidfile_spares_non_bridge_cmdline(self, tmp_path):
|
||||
"""Legacy pidfile (pid only): a PID that isn't node+session is spared."""
|
||||
proc = _spawn_sleeper() # cmdline is just python -c ... — not a bridge
|
||||
try:
|
||||
(tmp_path / "bridge.pid").write_text(str(proc.pid)) # legacy: pid only
|
||||
_kill_stale_bridge_by_pidfile(tmp_path)
|
||||
assert not _wait_dead(proc, timeout=1.0), "stranger must survive"
|
||||
assert proc.poll() is None
|
||||
finally:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_legacy_pidfile_kills_matching_bridge_cmdline(self, tmp_path):
|
||||
"""Legacy pidfile: a PID whose cmdline names node + session IS reaped."""
|
||||
# Shape the cmdline to look like the node bridge for this session.
|
||||
proc = _spawn_sleeper("node", str(tmp_path))
|
||||
try:
|
||||
(tmp_path / "bridge.pid").write_text(str(proc.pid)) # legacy: pid only
|
||||
_kill_stale_bridge_by_pidfile(tmp_path)
|
||||
assert _wait_dead(proc), "a cmdline-confirmed bridge should be killed"
|
||||
finally:
|
||||
if proc.poll() is None:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_is_ours_false_for_dead_pid(self, tmp_path):
|
||||
assert _bridge_pid_is_ours(999999999, tmp_path, None) is False
|
||||
|
||||
def test_missing_pidfile_is_noop(self, tmp_path):
|
||||
# No file -> must not raise.
|
||||
_kill_stale_bridge_by_pidfile(tmp_path)
|
||||
Loading…
Add table
Add a link
Reference in a new issue