mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 10:52:21 +00:00
fix(whatsapp): only kill LISTENers when freeing the bridge port, never clients
This is the bug that was actually closing Firefox. `_kill_port_process`, run on every bridge (re)start to free the port, used `lsof -ti :PORT` / `fuser PORT/tcp` — both of which match a process whose socket merely *involves* that port number in ANY state, including ESTABLISHED client connections. It then SIGTERMed every match. The bridge defaults to port 3000 — a ubiquitous local dev-server port. With a browser tab open on localhost:3000, `lsof -ti :3000` returned Firefox's PID, so each restart of the (crash-looping) WhatsApp bridge SIGTERMed Firefox, closing the whole browser at irregular intervals with no crash and no coredump. Proven live with the kernel `signal:signal_generate` tracepoint: hermes-gateway(3396516) -> sig=15 (code=0/SI_USER) -> comm=firefox pid=3371585 captured immediately after a gateway start, while Firefox held a socket on the bridge port. Demonstrated over-match: `lsof -ti :8080` returns the listener AND the gateway's own client connection; `lsof -ti tcp:8080 -sTCP:LISTEN` returns only the listener. Fix: `_listener_pids_on_port` resolves only LISTEN-state sockets (`lsof -ti tcp:PORT -sTCP:LISTEN`, with an `ss -ltnp` fallback) and `_kill_port_process` signals just those. A client whose connection happens to involve the port number is never touched — which is also more correct, since a client never blocks the new bridge from binding. Windows already filtered LISTENING; the broad `fuser -k` path is removed. Adds TestKillPortProcess: real-socket tests proving a separate client process is excluded from the listener lookup and survives port cleanup. 9 tests green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
77fdbbfe81
commit
069ab40c5f
2 changed files with 115 additions and 30 deletions
|
|
@ -35,8 +35,46 @@ from hermes_constants import (
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _listener_pids_on_port(port: int) -> list:
|
||||
"""PIDs of processes *listening* on ``port`` (POSIX) — never clients.
|
||||
|
||||
This must match only LISTEN sockets. A bare ``lsof -i :PORT`` (or
|
||||
``fuser PORT/tcp``) also returns *clients* whose connection merely involves
|
||||
that port number — e.g. a browser with a tab open on a local dev server
|
||||
sharing the port. SIGTERMing those closed the user's browser at irregular
|
||||
intervals. Restricting to LISTEN state frees the port for a new bridge
|
||||
without ever touching an unrelated client.
|
||||
"""
|
||||
pids: list = []
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["lsof", "-ti", f"tcp:{port}", "-sTCP:LISTEN"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for line in result.stdout.strip().splitlines():
|
||||
try:
|
||||
pids.append(int(line))
|
||||
except ValueError:
|
||||
pass
|
||||
if pids:
|
||||
return pids
|
||||
except FileNotFoundError:
|
||||
pass # lsof not installed — fall through to ss
|
||||
# Fallback: ss (iproute2, present on virtually every modern Linux).
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["ss", "-ltnHp", f"sport = :{port}"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for m in re.finditer(r"pid=(\d+)", result.stdout):
|
||||
pids.append(int(m.group(1)))
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return pids
|
||||
|
||||
|
||||
def _kill_port_process(port: int) -> None:
|
||||
"""Kill any process listening on the given TCP port."""
|
||||
"""Kill any process *listening* on the given TCP port (a stale bridge)."""
|
||||
try:
|
||||
if _IS_WINDOWS:
|
||||
# Use netstat to find the PID bound to this port, then taskkill
|
||||
|
|
@ -57,35 +95,14 @@ def _kill_port_process(port: int) -> None:
|
|||
except subprocess.SubprocessError:
|
||||
pass
|
||||
else:
|
||||
# Try fuser first (Linux), fall back to lsof (macOS / WSL2)
|
||||
killed = False
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["fuser", f"{port}/tcp"],
|
||||
capture_output=True, timeout=5,
|
||||
)
|
||||
if result.returncode == 0:
|
||||
subprocess.run(
|
||||
["fuser", "-k", f"{port}/tcp"],
|
||||
capture_output=True, timeout=5,
|
||||
)
|
||||
killed = True
|
||||
except FileNotFoundError:
|
||||
pass # fuser not installed
|
||||
|
||||
if not killed:
|
||||
# POSIX: only ever signal a process LISTENING on the port. A client
|
||||
# whose connection happens to involve this port number (a browser
|
||||
# tab on a local dev server, etc.) must never be killed.
|
||||
for pid in _listener_pids_on_port(port):
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["lsof", "-ti", f":{port}"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for pid_str in result.stdout.strip().splitlines():
|
||||
try:
|
||||
os.kill(int(pid_str), signal.SIGTERM)
|
||||
except (ValueError, ProcessLookupError, PermissionError):
|
||||
pass
|
||||
except FileNotFoundError:
|
||||
pass # lsof not installed either
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue