fix(whatsapp): only kill LISTENers when freeing the bridge port, never clients

This is the bug that was actually closing Firefox. `_kill_port_process`, run on
every bridge (re)start to free the port, used `lsof -ti :PORT` / `fuser PORT/tcp`
— both of which match a process whose socket merely *involves* that port number
in ANY state, including ESTABLISHED client connections. It then SIGTERMed every
match.

The bridge defaults to port 3000 — a ubiquitous local dev-server port. With a
browser tab open on localhost:3000, `lsof -ti :3000` returned Firefox's PID, so
each restart of the (crash-looping) WhatsApp bridge SIGTERMed Firefox, closing
the whole browser at irregular intervals with no crash and no coredump.

Proven live with the kernel `signal:signal_generate` tracepoint:
  hermes-gateway(3396516) -> sig=15 (code=0/SI_USER) -> comm=firefox pid=3371585
captured immediately after a gateway start, while Firefox held a socket on the
bridge port. Demonstrated over-match: `lsof -ti :8080` returns the listener AND
the gateway's own client connection; `lsof -ti tcp:8080 -sTCP:LISTEN` returns
only the listener.

Fix: `_listener_pids_on_port` resolves only LISTEN-state sockets
(`lsof -ti tcp:PORT -sTCP:LISTEN`, with an `ss -ltnp` fallback) and
`_kill_port_process` signals just those. A client whose connection happens to
involve the port number is never touched — which is also more correct, since a
client never blocks the new bridge from binding. Windows already filtered
LISTENING; the broad `fuser -k` path is removed.

Adds TestKillPortProcess: real-socket tests proving a separate client process
is excluded from the listener lookup and survives port cleanup. 9 tests green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
valentt 2026-06-11 01:36:09 +02:00 committed by Teknium
parent 77fdbbfe81
commit 069ab40c5f
2 changed files with 115 additions and 30 deletions

View file

@ -35,8 +35,46 @@ from hermes_constants import (
logger = logging.getLogger(__name__)
def _listener_pids_on_port(port: int) -> list:
"""PIDs of processes *listening* on ``port`` (POSIX) — never clients.
This must match only LISTEN sockets. A bare ``lsof -i :PORT`` (or
``fuser PORT/tcp``) also returns *clients* whose connection merely involves
that port number e.g. a browser with a tab open on a local dev server
sharing the port. SIGTERMing those closed the user's browser at irregular
intervals. Restricting to LISTEN state frees the port for a new bridge
without ever touching an unrelated client.
"""
pids: list = []
try:
result = subprocess.run(
["lsof", "-ti", f"tcp:{port}", "-sTCP:LISTEN"],
capture_output=True, text=True, timeout=5,
)
for line in result.stdout.strip().splitlines():
try:
pids.append(int(line))
except ValueError:
pass
if pids:
return pids
except FileNotFoundError:
pass # lsof not installed — fall through to ss
# Fallback: ss (iproute2, present on virtually every modern Linux).
try:
result = subprocess.run(
["ss", "-ltnHp", f"sport = :{port}"],
capture_output=True, text=True, timeout=5,
)
for m in re.finditer(r"pid=(\d+)", result.stdout):
pids.append(int(m.group(1)))
except FileNotFoundError:
pass
return pids
def _kill_port_process(port: int) -> None:
"""Kill any process listening on the given TCP port."""
"""Kill any process *listening* on the given TCP port (a stale bridge)."""
try:
if _IS_WINDOWS:
# Use netstat to find the PID bound to this port, then taskkill
@ -57,35 +95,14 @@ def _kill_port_process(port: int) -> None:
except subprocess.SubprocessError:
pass
else:
# Try fuser first (Linux), fall back to lsof (macOS / WSL2)
killed = False
try:
result = subprocess.run(
["fuser", f"{port}/tcp"],
capture_output=True, timeout=5,
)
if result.returncode == 0:
subprocess.run(
["fuser", "-k", f"{port}/tcp"],
capture_output=True, timeout=5,
)
killed = True
except FileNotFoundError:
pass # fuser not installed
if not killed:
# POSIX: only ever signal a process LISTENING on the port. A client
# whose connection happens to involve this port number (a browser
# tab on a local dev server, etc.) must never be killed.
for pid in _listener_pids_on_port(port):
try:
result = subprocess.run(
["lsof", "-ti", f":{port}"],
capture_output=True, text=True, timeout=5,
)
for pid_str in result.stdout.strip().splitlines():
try:
os.kill(int(pid_str), signal.SIGTERM)
except (ValueError, ProcessLookupError, PermissionError):
pass
except FileNotFoundError:
pass # lsof not installed either
os.kill(pid, signal.SIGTERM)
except (ProcessLookupError, PermissionError, OSError):
pass
except Exception:
pass

View file

@ -19,12 +19,17 @@ import time
import pytest
import os
import socket
from gateway.platforms.whatsapp import (
_bridge_pid_is_ours,
_kill_port_process,
_kill_stale_bridge_by_pidfile,
_listener_pids_on_port,
_write_bridge_pidfile,
)
from gateway.status import get_process_start_time
from gateway.status import get_process_start_time, _pid_exists
def _spawn_sleeper(*extra_argv) -> subprocess.Popen:
@ -116,3 +121,66 @@ class TestIdentityGuard:
def test_missing_pidfile_is_noop(self, tmp_path):
# No file -> must not raise.
_kill_stale_bridge_by_pidfile(tmp_path)
class TestKillPortProcess:
"""Freeing the bridge port must target only LISTENers, never clients.
Root cause of the live Firefox kills: ``lsof -ti :PORT`` (and ``fuser
PORT/tcp``) also returned *client* sockets whose connection merely involved
the port number. The WhatsApp bridge uses port 3000 by default a common
local dev-server port so a browser tab on ``localhost:3000`` was matched
and SIGTERMed every time the (crash-looping) bridge restarted.
"""
def test_listener_lookup_excludes_client_process(self):
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
srv.bind(("127.0.0.1", 0))
port = srv.getsockname()[1]
srv.listen(5)
# A separate process holding a *client* connection to that port.
client = subprocess.Popen([
sys.executable, "-c",
"import socket,time; c=socket.create_connection(('127.0.0.1',%d)); time.sleep(30)" % port,
])
try:
conn, _ = srv.accept() # establish the client connection
pids = _listener_pids_on_port(port)
if os.getpid() not in pids:
pytest.skip("neither lsof nor ss detected the listener here")
# The listener (this process) is found; the client process is NOT —
# the LISTEN filter is what spares unrelated clients like a browser.
assert client.pid not in pids
conn.close()
finally:
client.kill()
client.wait()
srv.close()
def test_kill_port_spares_client_process(self):
# Listener in a SEPARATE process — the legitimate kill target. This
# pytest process is the CLIENT: if port cleanup matched clients it would
# SIGTERM the test runner, so simply reaching the asserts proves the
# client was spared.
listener = subprocess.Popen(
[
sys.executable, "-c",
"import socket,time;"
"s=socket.socket();s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR,1);"
"s.bind(('127.0.0.1',0));print(s.getsockname()[1],flush=True);"
"s.listen(5);time.sleep(30)",
],
stdout=subprocess.PIPE, text=True,
)
try:
port = int(listener.stdout.readline().strip())
cli = socket.create_connection(("127.0.0.1", port)) # we are the client
_kill_port_process(port)
assert _pid_exists(os.getpid()), "client (test process) must survive"
assert _wait_dead(listener, timeout=5.0), "stale listener should be killed"
cli.close()
finally:
if listener.poll() is None:
listener.kill()
listener.wait()