mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 10:52:21 +00:00
fix(whatsapp): only kill LISTENers when freeing the bridge port, never clients
This is the bug that was actually closing Firefox. `_kill_port_process`, run on every bridge (re)start to free the port, used `lsof -ti :PORT` / `fuser PORT/tcp` — both of which match a process whose socket merely *involves* that port number in ANY state, including ESTABLISHED client connections. It then SIGTERMed every match. The bridge defaults to port 3000 — a ubiquitous local dev-server port. With a browser tab open on localhost:3000, `lsof -ti :3000` returned Firefox's PID, so each restart of the (crash-looping) WhatsApp bridge SIGTERMed Firefox, closing the whole browser at irregular intervals with no crash and no coredump. Proven live with the kernel `signal:signal_generate` tracepoint: hermes-gateway(3396516) -> sig=15 (code=0/SI_USER) -> comm=firefox pid=3371585 captured immediately after a gateway start, while Firefox held a socket on the bridge port. Demonstrated over-match: `lsof -ti :8080` returns the listener AND the gateway's own client connection; `lsof -ti tcp:8080 -sTCP:LISTEN` returns only the listener. Fix: `_listener_pids_on_port` resolves only LISTEN-state sockets (`lsof -ti tcp:PORT -sTCP:LISTEN`, with an `ss -ltnp` fallback) and `_kill_port_process` signals just those. A client whose connection happens to involve the port number is never touched — which is also more correct, since a client never blocks the new bridge from binding. Windows already filtered LISTENING; the broad `fuser -k` path is removed. Adds TestKillPortProcess: real-socket tests proving a separate client process is excluded from the listener lookup and survives port cleanup. 9 tests green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
77fdbbfe81
commit
069ab40c5f
2 changed files with 115 additions and 30 deletions
|
|
@ -35,8 +35,46 @@ from hermes_constants import (
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _listener_pids_on_port(port: int) -> list:
|
||||
"""PIDs of processes *listening* on ``port`` (POSIX) — never clients.
|
||||
|
||||
This must match only LISTEN sockets. A bare ``lsof -i :PORT`` (or
|
||||
``fuser PORT/tcp``) also returns *clients* whose connection merely involves
|
||||
that port number — e.g. a browser with a tab open on a local dev server
|
||||
sharing the port. SIGTERMing those closed the user's browser at irregular
|
||||
intervals. Restricting to LISTEN state frees the port for a new bridge
|
||||
without ever touching an unrelated client.
|
||||
"""
|
||||
pids: list = []
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["lsof", "-ti", f"tcp:{port}", "-sTCP:LISTEN"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for line in result.stdout.strip().splitlines():
|
||||
try:
|
||||
pids.append(int(line))
|
||||
except ValueError:
|
||||
pass
|
||||
if pids:
|
||||
return pids
|
||||
except FileNotFoundError:
|
||||
pass # lsof not installed — fall through to ss
|
||||
# Fallback: ss (iproute2, present on virtually every modern Linux).
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["ss", "-ltnHp", f"sport = :{port}"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for m in re.finditer(r"pid=(\d+)", result.stdout):
|
||||
pids.append(int(m.group(1)))
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return pids
|
||||
|
||||
|
||||
def _kill_port_process(port: int) -> None:
|
||||
"""Kill any process listening on the given TCP port."""
|
||||
"""Kill any process *listening* on the given TCP port (a stale bridge)."""
|
||||
try:
|
||||
if _IS_WINDOWS:
|
||||
# Use netstat to find the PID bound to this port, then taskkill
|
||||
|
|
@ -57,35 +95,14 @@ def _kill_port_process(port: int) -> None:
|
|||
except subprocess.SubprocessError:
|
||||
pass
|
||||
else:
|
||||
# Try fuser first (Linux), fall back to lsof (macOS / WSL2)
|
||||
killed = False
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["fuser", f"{port}/tcp"],
|
||||
capture_output=True, timeout=5,
|
||||
)
|
||||
if result.returncode == 0:
|
||||
subprocess.run(
|
||||
["fuser", "-k", f"{port}/tcp"],
|
||||
capture_output=True, timeout=5,
|
||||
)
|
||||
killed = True
|
||||
except FileNotFoundError:
|
||||
pass # fuser not installed
|
||||
|
||||
if not killed:
|
||||
# POSIX: only ever signal a process LISTENING on the port. A client
|
||||
# whose connection happens to involve this port number (a browser
|
||||
# tab on a local dev server, etc.) must never be killed.
|
||||
for pid in _listener_pids_on_port(port):
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["lsof", "-ti", f":{port}"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for pid_str in result.stdout.strip().splitlines():
|
||||
try:
|
||||
os.kill(int(pid_str), signal.SIGTERM)
|
||||
except (ValueError, ProcessLookupError, PermissionError):
|
||||
pass
|
||||
except FileNotFoundError:
|
||||
pass # lsof not installed either
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -19,12 +19,17 @@ import time
|
|||
|
||||
import pytest
|
||||
|
||||
import os
|
||||
import socket
|
||||
|
||||
from gateway.platforms.whatsapp import (
|
||||
_bridge_pid_is_ours,
|
||||
_kill_port_process,
|
||||
_kill_stale_bridge_by_pidfile,
|
||||
_listener_pids_on_port,
|
||||
_write_bridge_pidfile,
|
||||
)
|
||||
from gateway.status import get_process_start_time
|
||||
from gateway.status import get_process_start_time, _pid_exists
|
||||
|
||||
|
||||
def _spawn_sleeper(*extra_argv) -> subprocess.Popen:
|
||||
|
|
@ -116,3 +121,66 @@ class TestIdentityGuard:
|
|||
def test_missing_pidfile_is_noop(self, tmp_path):
|
||||
# No file -> must not raise.
|
||||
_kill_stale_bridge_by_pidfile(tmp_path)
|
||||
|
||||
|
||||
class TestKillPortProcess:
|
||||
"""Freeing the bridge port must target only LISTENers, never clients.
|
||||
|
||||
Root cause of the live Firefox kills: ``lsof -ti :PORT`` (and ``fuser
|
||||
PORT/tcp``) also returned *client* sockets whose connection merely involved
|
||||
the port number. The WhatsApp bridge uses port 3000 by default — a common
|
||||
local dev-server port — so a browser tab on ``localhost:3000`` was matched
|
||||
and SIGTERMed every time the (crash-looping) bridge restarted.
|
||||
"""
|
||||
|
||||
def test_listener_lookup_excludes_client_process(self):
|
||||
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||
srv.bind(("127.0.0.1", 0))
|
||||
port = srv.getsockname()[1]
|
||||
srv.listen(5)
|
||||
# A separate process holding a *client* connection to that port.
|
||||
client = subprocess.Popen([
|
||||
sys.executable, "-c",
|
||||
"import socket,time; c=socket.create_connection(('127.0.0.1',%d)); time.sleep(30)" % port,
|
||||
])
|
||||
try:
|
||||
conn, _ = srv.accept() # establish the client connection
|
||||
pids = _listener_pids_on_port(port)
|
||||
if os.getpid() not in pids:
|
||||
pytest.skip("neither lsof nor ss detected the listener here")
|
||||
# The listener (this process) is found; the client process is NOT —
|
||||
# the LISTEN filter is what spares unrelated clients like a browser.
|
||||
assert client.pid not in pids
|
||||
conn.close()
|
||||
finally:
|
||||
client.kill()
|
||||
client.wait()
|
||||
srv.close()
|
||||
|
||||
def test_kill_port_spares_client_process(self):
|
||||
# Listener in a SEPARATE process — the legitimate kill target. This
|
||||
# pytest process is the CLIENT: if port cleanup matched clients it would
|
||||
# SIGTERM the test runner, so simply reaching the asserts proves the
|
||||
# client was spared.
|
||||
listener = subprocess.Popen(
|
||||
[
|
||||
sys.executable, "-c",
|
||||
"import socket,time;"
|
||||
"s=socket.socket();s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR,1);"
|
||||
"s.bind(('127.0.0.1',0));print(s.getsockname()[1],flush=True);"
|
||||
"s.listen(5);time.sleep(30)",
|
||||
],
|
||||
stdout=subprocess.PIPE, text=True,
|
||||
)
|
||||
try:
|
||||
port = int(listener.stdout.readline().strip())
|
||||
cli = socket.create_connection(("127.0.0.1", port)) # we are the client
|
||||
_kill_port_process(port)
|
||||
assert _pid_exists(os.getpid()), "client (test process) must survive"
|
||||
assert _wait_dead(listener, timeout=5.0), "stale listener should be killed"
|
||||
cli.close()
|
||||
finally:
|
||||
if listener.poll() is None:
|
||||
listener.kill()
|
||||
listener.wait()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue