diff --git a/plugins/platforms/whatsapp/adapter.py b/plugins/platforms/whatsapp/adapter.py index 4526e31278c..94ba3064b4e 100644 --- a/plugins/platforms/whatsapp/adapter.py +++ b/plugins/platforms/whatsapp/adapter.py @@ -35,8 +35,46 @@ from hermes_constants import ( logger = logging.getLogger(__name__) +def _listener_pids_on_port(port: int) -> list: + """PIDs of processes *listening* on ``port`` (POSIX) — never clients. + + This must match only LISTEN sockets. A bare ``lsof -i :PORT`` (or + ``fuser PORT/tcp``) also returns *clients* whose connection merely involves + that port number — e.g. a browser with a tab open on a local dev server + sharing the port. SIGTERMing those closed the user's browser at irregular + intervals. Restricting to LISTEN state frees the port for a new bridge + without ever touching an unrelated client. + """ + pids: list = [] + try: + result = subprocess.run( + ["lsof", "-ti", f"tcp:{port}", "-sTCP:LISTEN"], + capture_output=True, text=True, timeout=5, + ) + for line in result.stdout.strip().splitlines(): + try: + pids.append(int(line)) + except ValueError: + pass + if pids: + return pids + except FileNotFoundError: + pass # lsof not installed — fall through to ss + # Fallback: ss (iproute2, present on virtually every modern Linux). + try: + result = subprocess.run( + ["ss", "-ltnHp", f"sport = :{port}"], + capture_output=True, text=True, timeout=5, + ) + for m in re.finditer(r"pid=(\d+)", result.stdout): + pids.append(int(m.group(1))) + except FileNotFoundError: + pass + return pids + + def _kill_port_process(port: int) -> None: - """Kill any process listening on the given TCP port.""" + """Kill any process *listening* on the given TCP port (a stale bridge).""" try: if _IS_WINDOWS: # Use netstat to find the PID bound to this port, then taskkill @@ -57,35 +95,14 @@ def _kill_port_process(port: int) -> None: except subprocess.SubprocessError: pass else: - # Try fuser first (Linux), fall back to lsof (macOS / WSL2) - killed = False - try: - result = subprocess.run( - ["fuser", f"{port}/tcp"], - capture_output=True, timeout=5, - ) - if result.returncode == 0: - subprocess.run( - ["fuser", "-k", f"{port}/tcp"], - capture_output=True, timeout=5, - ) - killed = True - except FileNotFoundError: - pass # fuser not installed - - if not killed: + # POSIX: only ever signal a process LISTENING on the port. A client + # whose connection happens to involve this port number (a browser + # tab on a local dev server, etc.) must never be killed. + for pid in _listener_pids_on_port(port): try: - result = subprocess.run( - ["lsof", "-ti", f":{port}"], - capture_output=True, text=True, timeout=5, - ) - for pid_str in result.stdout.strip().splitlines(): - try: - os.kill(int(pid_str), signal.SIGTERM) - except (ValueError, ProcessLookupError, PermissionError): - pass - except FileNotFoundError: - pass # lsof not installed either + os.kill(pid, signal.SIGTERM) + except (ProcessLookupError, PermissionError, OSError): + pass except Exception: pass diff --git a/tests/gateway/test_whatsapp_bridge_pidfile.py b/tests/gateway/test_whatsapp_bridge_pidfile.py index 0e43b621fa8..b25a7d30faf 100644 --- a/tests/gateway/test_whatsapp_bridge_pidfile.py +++ b/tests/gateway/test_whatsapp_bridge_pidfile.py @@ -19,12 +19,17 @@ import time import pytest +import os +import socket + from gateway.platforms.whatsapp import ( _bridge_pid_is_ours, + _kill_port_process, _kill_stale_bridge_by_pidfile, + _listener_pids_on_port, _write_bridge_pidfile, ) -from gateway.status import get_process_start_time +from gateway.status import get_process_start_time, _pid_exists def _spawn_sleeper(*extra_argv) -> subprocess.Popen: @@ -116,3 +121,66 @@ class TestIdentityGuard: def test_missing_pidfile_is_noop(self, tmp_path): # No file -> must not raise. _kill_stale_bridge_by_pidfile(tmp_path) + + +class TestKillPortProcess: + """Freeing the bridge port must target only LISTENers, never clients. + + Root cause of the live Firefox kills: ``lsof -ti :PORT`` (and ``fuser + PORT/tcp``) also returned *client* sockets whose connection merely involved + the port number. The WhatsApp bridge uses port 3000 by default — a common + local dev-server port — so a browser tab on ``localhost:3000`` was matched + and SIGTERMed every time the (crash-looping) bridge restarted. + """ + + def test_listener_lookup_excludes_client_process(self): + srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", 0)) + port = srv.getsockname()[1] + srv.listen(5) + # A separate process holding a *client* connection to that port. + client = subprocess.Popen([ + sys.executable, "-c", + "import socket,time; c=socket.create_connection(('127.0.0.1',%d)); time.sleep(30)" % port, + ]) + try: + conn, _ = srv.accept() # establish the client connection + pids = _listener_pids_on_port(port) + if os.getpid() not in pids: + pytest.skip("neither lsof nor ss detected the listener here") + # The listener (this process) is found; the client process is NOT — + # the LISTEN filter is what spares unrelated clients like a browser. + assert client.pid not in pids + conn.close() + finally: + client.kill() + client.wait() + srv.close() + + def test_kill_port_spares_client_process(self): + # Listener in a SEPARATE process — the legitimate kill target. This + # pytest process is the CLIENT: if port cleanup matched clients it would + # SIGTERM the test runner, so simply reaching the asserts proves the + # client was spared. + listener = subprocess.Popen( + [ + sys.executable, "-c", + "import socket,time;" + "s=socket.socket();s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR,1);" + "s.bind(('127.0.0.1',0));print(s.getsockname()[1],flush=True);" + "s.listen(5);time.sleep(30)", + ], + stdout=subprocess.PIPE, text=True, + ) + try: + port = int(listener.stdout.readline().strip()) + cli = socket.create_connection(("127.0.0.1", port)) # we are the client + _kill_port_process(port) + assert _pid_exists(os.getpid()), "client (test process) must survive" + assert _wait_dead(listener, timeout=5.0), "stale listener should be killed" + cli.close() + finally: + if listener.poll() is None: + listener.kill() + listener.wait()