mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
fix(windows): os.kill(pid, 0) is NOT a no-op on Windows — route through new _pid_exists helper
On Windows, Python's ``os.kill(pid, 0)`` is NOT a no-op. CPython's
implementation (``Modules/posixmodule.c::os_kill_impl``) treats sig=0
as ``CTRL_C_EVENT`` because the two integer values collide at the C
layer, and routes it through ``GenerateConsoleCtrlEvent(0, pid)`` —
which sends a Ctrl+C to the ENTIRE console process group containing
the target PID, not just the PID itself. Any caller that wanted to
check "is PID X alive" via the classic POSIX ``os.kill(pid, 0)``
idiom was silently killing that process (and often unrelated
processes in the same console group) on Windows. Long-standing
Python Windows quirk; see bpo-14484 (open since 2012).
This manifested in Hermes as: every ``hermes gateway status``
invocation would read the gateway's PID from the PID file, call
``os.kill(pid, 0)`` via ``gateway.status.get_running_pid()`` as a
"liveness check", and instantly terminate the gateway it was trying
to report on. No shutdown log, no traceback, no atexit hook fire,
no exit-diag entry — just silent termination of the detached pythonw
process. "Bot answered one message then stopped typing" was the
characteristic end-user symptom because `os.kill(pid, 0)` fires
mid-response-send and kills the gateway between logs.
Reproduction (verified in this branch before the fix):
$ hermes gateway start # gateway alive, PID 37520
$ hermes gateway status # reports "No gateway process detected"
$ tasklist /FI "PID eq 37520" # INFO: No tasks are running
# — gateway terminated silently
Root-cause fix is a new ``gateway.status._pid_exists(pid)`` helper:
- On Windows: Win32 ``OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION |
SYNCHRONIZE, False, pid)`` + ``WaitForSingleObject(handle, 0)``
via ctypes. Zero signal delivery, zero console-group side effects.
Pins ctypes return types to avoid DWORD-vs-signed-int parse bugs
on WAIT_TIMEOUT (0x102). Distinguishes ERROR_INVALID_PARAMETER
(PID gone) from ERROR_ACCESS_DENIED (alive but another user).
- On POSIX: the canonical ``os.kill(pid, 0)`` idiom that actually is
a no-op there.
Then patch every ``os.kill(pid, 0)`` liveness-check callsite to
route through ``_pid_exists`` instead. Total 14 callsites across
11 files; every single one was a latent silent-kill on Windows:
gateway/run.py:2810 — /restart watcher (inline subprocess)
gateway/run.py:15195 — --replace wait loop
gateway/status.py:572 — acquire_gateway_runtime_lock stale check
gateway/status.py:828 — get_running_pid (THE killer for status)
gateway/platforms/whatsapp.py:111
hermes_cli/gateway.py:228, 522, 1012 — gateway-related drain loops
hermes_cli/kanban_db.py:2826 — _pid_alive was claiming to
be cross-platform but used
os.kill(pid, 0) on Windows
hermes_cli/main.py:5792 — CLI process-kill polling
hermes_cli/profiles.py:782 — profile stop wait loop
plugins/google_meet/process_manager.py:74
tools/browser_tool.py:1215, 1255 — browser daemon ownership probes
tools/mcp_tool.py:1255, 3374 — MCP stdio orphan tracking
The watcher source in gateway/run.py:2810 is a multi-line string
that gets spawned as an inline ``python -c "..."`` subprocess, so
it can't import gateway.status. The fix for that callsite inlines
the same ctypes probe directly into the watcher source.
Tested on Windows 10 with the hermes gateway + Telegram bot:
- gateway start → alive
- 5 consecutive ``hermes gateway status`` invocations → gateway
alive after every one, same PID reported each time (37520, 21952)
- gateway.log shows uninterrupted operation; no spurious shutdown
entries; cron ticker and kanban dispatcher still running on
their 60-second cadence
- bot continues answering Telegram messages throughout
Ships alongside an exit-path diagnostic wrapper in
``hermes_cli/gateway.py::run_gateway()`` that captures every way
``asyncio.run(start_gateway(...))`` can return (success, SystemExit,
KeyboardInterrupt, BaseException, atexit) with full traceback to
``logs/gateway-exit-diag.log``. This was used to prove the gateway
was being hard-killed externally (no exit event fired) and should
be kept for future Windows debugging.
Refs: https://bugs.python.org/issue14484
See also: references/windows-subprocess-sigint-storm.md in
the hermes-agent skill.
This commit is contained in:
parent
ac178b78c4
commit
1cbe399149
11 changed files with 296 additions and 151 deletions
|
|
@ -107,12 +107,15 @@ def _kill_stale_bridge_by_pidfile(session_path: Path) -> None:
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
try:
|
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use the
|
||||||
os.kill(pid, 0) # check existence
|
# cross-platform existence check before sending a real signal.
|
||||||
os.kill(pid, signal.SIGTERM)
|
from gateway.status import _pid_exists
|
||||||
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
|
if _pid_exists(pid):
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
try:
|
||||||
pass
|
os.kill(pid, signal.SIGTERM)
|
||||||
|
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
|
||||||
|
except (ProcessLookupError, PermissionError, OSError):
|
||||||
|
pass
|
||||||
try:
|
try:
|
||||||
pid_file.unlink()
|
pid_file.unlink()
|
||||||
except OSError:
|
except OSError:
|
||||||
|
|
|
||||||
|
|
@ -2805,10 +2805,36 @@ class GatewayRunner:
|
||||||
pid = int(sys.argv[1])
|
pid = int(sys.argv[1])
|
||||||
cmd = sys.argv[2:]
|
cmd = sys.argv[2:]
|
||||||
deadline = time.monotonic() + 120
|
deadline = time.monotonic() + 120
|
||||||
while time.monotonic() < deadline:
|
|
||||||
|
def _alive(p):
|
||||||
|
# On Windows, os.kill(pid, 0) is NOT a no-op — it maps to
|
||||||
|
# GenerateConsoleCtrlEvent(0, pid) (bpo-14484). Use the
|
||||||
|
# Win32 handle-based existence check instead.
|
||||||
|
if os.name == 'nt':
|
||||||
|
import ctypes
|
||||||
|
k32 = ctypes.windll.kernel32
|
||||||
|
k32.OpenProcess.restype = ctypes.c_void_p
|
||||||
|
k32.WaitForSingleObject.restype = ctypes.c_uint
|
||||||
|
k32.GetLastError.restype = ctypes.c_uint
|
||||||
|
h = k32.OpenProcess(0x1000 | 0x100000, False, int(p))
|
||||||
|
if not h:
|
||||||
|
return k32.GetLastError() != 87
|
||||||
|
try:
|
||||||
|
return k32.WaitForSingleObject(h, 0) == 0x102
|
||||||
|
finally:
|
||||||
|
k32.CloseHandle(h)
|
||||||
try:
|
try:
|
||||||
os.kill(pid, 0)
|
os.kill(int(p), 0)
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
return True
|
||||||
|
except ProcessLookupError:
|
||||||
|
return False
|
||||||
|
except PermissionError:
|
||||||
|
return True
|
||||||
|
except OSError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
while time.monotonic() < deadline:
|
||||||
|
if not _alive(pid):
|
||||||
break
|
break
|
||||||
time.sleep(0.2)
|
time.sleep(0.2)
|
||||||
_CREATE_NEW_PROCESS_GROUP = 0x00000200
|
_CREATE_NEW_PROCESS_GROUP = 0x00000200
|
||||||
|
|
@ -15189,16 +15215,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return False
|
return False
|
||||||
# Wait up to 10 seconds for the old process to exit
|
# Wait up to 10 seconds for the old process to exit.
|
||||||
|
# ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the
|
||||||
|
# handle-based existence check instead.
|
||||||
|
from gateway.status import _pid_exists
|
||||||
for _ in range(20):
|
for _ in range(20):
|
||||||
try:
|
if not _pid_exists(existing_pid):
|
||||||
os.kill(existing_pid, 0)
|
|
||||||
time.sleep(0.5)
|
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
|
||||||
# OSError covers Windows' WinError 87 "invalid parameter"
|
|
||||||
# for an already-gone PID — without this the probe loop
|
|
||||||
# busy-spins for the full 10s on every --replace start.
|
|
||||||
break # Process is gone
|
break # Process is gone
|
||||||
|
time.sleep(0.5)
|
||||||
else:
|
else:
|
||||||
# Still alive after 10s — force kill
|
# Still alive after 10s — force kill
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
|
||||||
|
|
@ -299,6 +299,71 @@ def _try_acquire_file_lock(handle) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _pid_exists(pid: int) -> bool:
|
||||||
|
"""Cross-platform "is this PID alive" check that does NOT kill the target.
|
||||||
|
|
||||||
|
CRITICAL on Windows: Python's ``os.kill(pid, 0)`` is NOT a no-op like it
|
||||||
|
is on POSIX. CPython's Windows implementation
|
||||||
|
(``Modules/posixmodule.c::os_kill_impl``) treats ``sig=0`` as
|
||||||
|
``CTRL_C_EVENT`` because the two values collide at the C level, and
|
||||||
|
routes it through ``GenerateConsoleCtrlEvent(0, pid)`` — which sends
|
||||||
|
a Ctrl+C to the entire console process group containing the target
|
||||||
|
PID, not just the PID itself. Any caller that wanted to "check if
|
||||||
|
this PID is alive" via ``os.kill(pid, 0)`` on Windows was silently
|
||||||
|
killing that process (and often unrelated processes in the same
|
||||||
|
console group). Long-standing Python quirk; see bpo-14484.
|
||||||
|
|
||||||
|
Fix: use the Win32 ``OpenProcess`` / ``WaitForSingleObject`` pair on
|
||||||
|
Windows to check existence without any signal path; use the POSIX
|
||||||
|
``os.kill(pid, 0)`` idiom on POSIX where it actually is a no-op.
|
||||||
|
"""
|
||||||
|
if _IS_WINDOWS:
|
||||||
|
try:
|
||||||
|
import ctypes
|
||||||
|
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
|
||||||
|
# Pin return types — default ctypes restype is c_int (signed),
|
||||||
|
# which mangles WAIT_* DWORD return codes into negative numbers.
|
||||||
|
kernel32.OpenProcess.restype = ctypes.c_void_p
|
||||||
|
kernel32.WaitForSingleObject.restype = ctypes.c_uint
|
||||||
|
kernel32.GetLastError.restype = ctypes.c_uint
|
||||||
|
PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
|
||||||
|
SYNCHRONIZE = 0x100000 # required for WaitForSingleObject
|
||||||
|
WAIT_TIMEOUT = 0x00000102
|
||||||
|
ERROR_INVALID_PARAMETER = 87
|
||||||
|
ERROR_ACCESS_DENIED = 5
|
||||||
|
handle = kernel32.OpenProcess(
|
||||||
|
PROCESS_QUERY_LIMITED_INFORMATION | SYNCHRONIZE, False, int(pid)
|
||||||
|
)
|
||||||
|
if not handle:
|
||||||
|
err = kernel32.GetLastError()
|
||||||
|
if err == ERROR_INVALID_PARAMETER:
|
||||||
|
return False # PID definitely gone
|
||||||
|
if err == ERROR_ACCESS_DENIED:
|
||||||
|
return True # Exists but owned by another user/session
|
||||||
|
return False # Conservative default for unknown errors
|
||||||
|
try:
|
||||||
|
wait_result = kernel32.WaitForSingleObject(handle, 0)
|
||||||
|
# WAIT_TIMEOUT = still running; anything else (WAIT_OBJECT_0
|
||||||
|
# via exit, WAIT_FAILED via handle issue) = treat as gone.
|
||||||
|
return wait_result == WAIT_TIMEOUT
|
||||||
|
finally:
|
||||||
|
kernel32.CloseHandle(handle)
|
||||||
|
except (OSError, AttributeError):
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
os.kill(int(pid), 0)
|
||||||
|
return True
|
||||||
|
except ProcessLookupError:
|
||||||
|
return False
|
||||||
|
except PermissionError:
|
||||||
|
# Process exists but we can't signal it — still alive.
|
||||||
|
return True
|
||||||
|
except OSError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _release_file_lock(handle) -> None:
|
def _release_file_lock(handle) -> None:
|
||||||
try:
|
try:
|
||||||
if _IS_WINDOWS:
|
if _IS_WINDOWS:
|
||||||
|
|
@ -503,10 +568,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
||||||
|
|
||||||
stale = existing_pid is None
|
stale = existing_pid is None
|
||||||
if not stale:
|
if not stale:
|
||||||
try:
|
if not _pid_exists(existing_pid):
|
||||||
os.kill(existing_pid, 0)
|
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
|
||||||
# Windows raises OSError with WinError 87 for invalid pid check
|
|
||||||
stale = True
|
stale = True
|
||||||
else:
|
else:
|
||||||
current_start = _get_process_start_time(existing_pid)
|
current_start = _get_process_start_time(existing_pid)
|
||||||
|
|
@ -517,7 +579,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
||||||
):
|
):
|
||||||
stale = True
|
stale = True
|
||||||
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
||||||
# processes still respond to os.kill(pid, 0) but are not
|
# processes still appear alive to _pid_exists but are not
|
||||||
# actually running. Treat them as stale so --replace works.
|
# actually running. Treat them as stale so --replace works.
|
||||||
if not stale:
|
if not stale:
|
||||||
try:
|
try:
|
||||||
|
|
@ -824,20 +886,7 @@ def get_running_pid(
|
||||||
if pid is None:
|
if pid is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
if not _pid_exists(pid):
|
||||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
|
||||||
except ProcessLookupError:
|
|
||||||
continue
|
|
||||||
except PermissionError:
|
|
||||||
# The process exists but belongs to another user/service scope.
|
|
||||||
# With the runtime lock still held, prefer keeping it visible
|
|
||||||
# rather than deleting the PID file as "stale".
|
|
||||||
if _record_looks_like_gateway(record):
|
|
||||||
return pid
|
|
||||||
continue
|
|
||||||
except OSError:
|
|
||||||
# Windows raises OSError with WinError 87 for an invalid pid
|
|
||||||
# (process is definitely gone). Treat as "process doesn't exist".
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
recorded_start = record.get("start_time")
|
recorded_start = record.get("start_time")
|
||||||
|
|
|
||||||
|
|
@ -223,18 +223,15 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool:
|
||||||
import time as _time
|
import time as _time
|
||||||
|
|
||||||
deadline = _time.monotonic() + max(drain_timeout, 1.0)
|
deadline = _time.monotonic() + max(drain_timeout, 1.0)
|
||||||
|
# IMPORTANT Windows note: ``os.kill(pid, 0)`` is NOT a no-op on
|
||||||
|
# Windows — Python's implementation calls ``TerminateProcess(handle, 0)``
|
||||||
|
# for sig=0, hard-killing the target. Use the cross-platform
|
||||||
|
# ``_pid_exists`` helper in gateway.status which does OpenProcess +
|
||||||
|
# WaitForSingleObject on Windows.
|
||||||
|
from gateway.status import _pid_exists
|
||||||
|
|
||||||
while _time.monotonic() < deadline:
|
while _time.monotonic() < deadline:
|
||||||
try:
|
if not _pid_exists(pid):
|
||||||
os.kill(pid, 0) # signal 0 — probe liveness
|
|
||||||
except ProcessLookupError:
|
|
||||||
return True
|
|
||||||
except PermissionError:
|
|
||||||
# Process still exists but we can't signal it. Treat as alive
|
|
||||||
# so the caller falls back.
|
|
||||||
pass
|
|
||||||
except OSError:
|
|
||||||
# Windows raises OSError (WinError 87 "invalid parameter") for
|
|
||||||
# a gone PID — treat the same as ProcessLookupError.
|
|
||||||
return True
|
return True
|
||||||
_time.sleep(0.5)
|
_time.sleep(0.5)
|
||||||
# Drain didn't finish in time.
|
# Drain didn't finish in time.
|
||||||
|
|
@ -303,6 +300,11 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
||||||
or f"HERMES_HOME={current_home}" in command
|
or f"HERMES_HOME={current_home}" in command
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Default-profile case: no profile flag in argv. Accept as long as
|
||||||
|
# the command doesn't advertise *some other* profile. HERMES_HOME
|
||||||
|
# may be passed via env (not visible in wmic/CIM command line) so
|
||||||
|
# its absence is NOT disqualifying — only a non-matching explicit
|
||||||
|
# HERMES_HOME= in argv is.
|
||||||
if "--profile " in command or " -p " in command:
|
if "--profile " in command or " -p " in command:
|
||||||
return False
|
return False
|
||||||
if "HERMES_HOME=" in command and f"HERMES_HOME={current_home}" not in command:
|
if "HERMES_HOME=" in command and f"HERMES_HOME={current_home}" not in command:
|
||||||
|
|
@ -513,14 +515,10 @@ def launch_detached_profile_gateway_restart(profile: str, old_pid: int) -> bool:
|
||||||
cmd = sys.argv[2:]
|
cmd = sys.argv[2:]
|
||||||
deadline = time.monotonic() + 120
|
deadline = time.monotonic() + 120
|
||||||
while time.monotonic() < deadline:
|
while time.monotonic() < deadline:
|
||||||
try:
|
# ``os.kill(pid, 0)`` is not a no-op on Windows — use the
|
||||||
os.kill(pid, 0)
|
# cross-platform existence check.
|
||||||
except ProcessLookupError:
|
from gateway.status import _pid_exists
|
||||||
break
|
if not _pid_exists(pid):
|
||||||
except PermissionError:
|
|
||||||
pass
|
|
||||||
except OSError:
|
|
||||||
# Windows: gone PID raises OSError (WinError 87).
|
|
||||||
break
|
break
|
||||||
time.sleep(0.2)
|
time.sleep(0.2)
|
||||||
|
|
||||||
|
|
@ -1007,15 +1005,14 @@ def stop_profile_gateway() -> bool:
|
||||||
print(f"⚠ Permission denied to kill PID {pid}")
|
print(f"⚠ Permission denied to kill PID {pid}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Wait briefly for it to exit
|
# Wait briefly for it to exit. On Windows, os.kill(pid, 0) is NOT
|
||||||
|
# a no-op — route through the cross-platform existence check.
|
||||||
import time as _time
|
import time as _time
|
||||||
|
from gateway.status import _pid_exists
|
||||||
for _ in range(20):
|
for _ in range(20):
|
||||||
try:
|
if not _pid_exists(pid):
|
||||||
os.kill(pid, 0)
|
|
||||||
_time.sleep(0.5)
|
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
|
||||||
# OSError covers Windows' WinError 87 for gone PIDs.
|
|
||||||
break
|
break
|
||||||
|
_time.sleep(0.5)
|
||||||
|
|
||||||
if get_running_pid() is None:
|
if get_running_pid() is None:
|
||||||
remove_pid_file()
|
remove_pid_file()
|
||||||
|
|
@ -2910,22 +2907,49 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
|
||||||
# stays alive; real user Ctrl+C still comes through prompt_toolkit /
|
# stays alive; real user Ctrl+C still comes through prompt_toolkit /
|
||||||
# the asyncio signal handler when running in a real console.
|
# the asyncio signal handler when running in a real console.
|
||||||
#
|
#
|
||||||
# Detection: sys.stdin is None / stdin-not-a-tty AND we're on Windows.
|
# IMPORTANT lesson (May 2026): we originally gated this on "stdin is
|
||||||
# This mirrors the same pattern as ``cli.py`` (commit 449ad952b).
|
# NOT a TTY" assuming only detached pythonw runs would be vulnerable.
|
||||||
|
# Wrong. When the user runs `hermes gateway start` from a PowerShell
|
||||||
|
# console, the gateway inherits that console and stdin IS a TTY —
|
||||||
|
# but it's STILL vulnerable to CTRL_C_EVENT broadcast by any sibling
|
||||||
|
# `hermes` invocation (like `hermes gateway status` 30 seconds later)
|
||||||
|
# because Windows routes console events to all processes sharing the
|
||||||
|
# console. Every hermes CLI process after that sibling fires is a
|
||||||
|
# potential drive-by killer. So on Windows, for `gateway run`
|
||||||
|
# specifically (never interactive by design), always install the
|
||||||
|
# SIGINT absorber regardless of TTY state.
|
||||||
|
try:
|
||||||
|
_stdin_is_tty = bool(sys.stdin and sys.stdin.isatty())
|
||||||
|
except (ValueError, OSError):
|
||||||
|
_stdin_is_tty = False
|
||||||
if is_windows():
|
if is_windows():
|
||||||
try:
|
try:
|
||||||
_stdin_is_tty = bool(sys.stdin and sys.stdin.isatty())
|
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||||
except (ValueError, OSError):
|
if hasattr(signal, "SIGBREAK"):
|
||||||
_stdin_is_tty = False
|
signal.signal(signal.SIGBREAK, signal.SIG_IGN)
|
||||||
if not _stdin_is_tty:
|
except (OSError, ValueError):
|
||||||
try:
|
# SetConsoleCtrlHandler not available (rare on Windows) —
|
||||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
# best-effort, proceed either way.
|
||||||
if hasattr(signal, "SIGBREAK"):
|
pass
|
||||||
signal.signal(signal.SIGBREAK, signal.SIG_IGN)
|
# Python's signal module only hooks SIGINT/SIGBREAK. To also
|
||||||
except (OSError, ValueError):
|
# absorb CTRL_CLOSE_EVENT / CTRL_LOGOFF_EVENT and any other
|
||||||
# SetConsoleCtrlHandler not available (rare on Windows) —
|
# console control signals Windows may broadcast to the console
|
||||||
# best-effort, proceed either way.
|
# process group, call the native SetConsoleCtrlHandler(NULL, TRUE)
|
||||||
pass
|
# — this tells the kernel to IGNORE all console control events
|
||||||
|
# for this process entirely, which is what background services
|
||||||
|
# are supposed to do. Belt-and-braces over the Python-level
|
||||||
|
# handlers above.
|
||||||
|
try:
|
||||||
|
import ctypes
|
||||||
|
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
|
||||||
|
# BOOL SetConsoleCtrlHandler(NULL, Add) — Add=TRUE means
|
||||||
|
# "install the NULL handler", which has the documented
|
||||||
|
# effect of ignoring Ctrl+C. Called twice for defense in
|
||||||
|
# depth: once before any Python import could have flipped
|
||||||
|
# our disposition, once as our last word.
|
||||||
|
kernel32.SetConsoleCtrlHandler(None, 1)
|
||||||
|
except (OSError, AttributeError):
|
||||||
|
pass
|
||||||
|
|
||||||
# Refresh the systemd unit definition on every boot so that restart
|
# Refresh the systemd unit definition on every boot so that restart
|
||||||
# settings (RestartSec, StartLimitIntervalSec, etc.) stay current even
|
# settings (RestartSec, StartLimitIntervalSec, etc.) stay current even
|
||||||
|
|
@ -2954,15 +2978,86 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
|
||||||
# Exit with code 1 if gateway fails to connect any platform,
|
# Exit with code 1 if gateway fails to connect any platform,
|
||||||
# so systemd Restart=always will retry on transient errors
|
# so systemd Restart=always will retry on transient errors
|
||||||
verbosity = None if quiet else verbose
|
verbosity = None if quiet else verbose
|
||||||
|
|
||||||
|
# ── Exit-path diagnostics ────────────────────────────────────────────
|
||||||
|
# When the gateway dies silently on Windows (no shutdown log, no
|
||||||
|
# traceback in gateway.log / errors.log), we're usually blind to the
|
||||||
|
# cause. The code below captures *every* way the asyncio.run() call
|
||||||
|
# below can return, with full context dumped to a dedicated log so
|
||||||
|
# the next silent death yields evidence instead of a mystery. This
|
||||||
|
# is diagnostic scaffolding; cheap to keep on, costs nothing during
|
||||||
|
# normal operation, and the emitted lines are opt-in via the
|
||||||
|
# HERMES_GATEWAY_EXIT_DIAG env var (default: on while we're still
|
||||||
|
# chasing the Windows lifecycle bug).
|
||||||
|
import atexit as _atexit
|
||||||
|
import traceback as _traceback
|
||||||
|
from datetime import datetime as _dt, timezone as _tz
|
||||||
|
|
||||||
|
def _exit_diag(tag: str, **extra: object) -> None:
|
||||||
|
if os.environ.get("HERMES_GATEWAY_EXIT_DIAG", "1") != "1":
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
from hermes_constants import get_hermes_home as _ghh
|
||||||
|
log_dir = _ghh() / "logs"
|
||||||
|
log_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
ts = _dt.now(_tz.utc).isoformat()
|
||||||
|
line = {
|
||||||
|
"ts": ts,
|
||||||
|
"tag": tag,
|
||||||
|
"pid": os.getpid(),
|
||||||
|
"python": sys.version.split()[0],
|
||||||
|
"platform": sys.platform,
|
||||||
|
**extra,
|
||||||
|
}
|
||||||
|
import json as _json
|
||||||
|
with open(log_dir / "gateway-exit-diag.log", "a", encoding="utf-8") as f:
|
||||||
|
f.write(_json.dumps(line, default=str) + "\n")
|
||||||
|
except Exception:
|
||||||
|
pass # never let the diagnostic itself crash the gateway
|
||||||
|
|
||||||
|
_exit_diag(
|
||||||
|
"gateway.start",
|
||||||
|
replace=replace,
|
||||||
|
argv=sys.argv,
|
||||||
|
stdin_is_tty=_stdin_is_tty,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _atexit_hook() -> None:
|
||||||
|
_exit_diag("atexit.hook", sys_exc=repr(sys.exc_info()))
|
||||||
|
|
||||||
|
_atexit.register(_atexit_hook)
|
||||||
|
|
||||||
|
success = False
|
||||||
try:
|
try:
|
||||||
success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))
|
success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))
|
||||||
|
_exit_diag("asyncio.run.returned", success=success)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# On Windows-detached runs this shouldn't fire (we absorb SIGINT above),
|
# On Windows-detached runs this shouldn't fire (we absorb SIGINT above),
|
||||||
# but keep the handler for console runs.
|
# but keep the handler for console runs.
|
||||||
|
_exit_diag(
|
||||||
|
"asyncio.run.KeyboardInterrupt",
|
||||||
|
traceback=_traceback.format_exc(),
|
||||||
|
)
|
||||||
print("\nGateway stopped.")
|
print("\nGateway stopped.")
|
||||||
return
|
return
|
||||||
|
except SystemExit as e:
|
||||||
|
_exit_diag("asyncio.run.SystemExit", code=getattr(e, "code", None),
|
||||||
|
traceback=_traceback.format_exc())
|
||||||
|
raise
|
||||||
|
except BaseException as e:
|
||||||
|
# Absolutely everything else: Exception, asyncio.CancelledError,
|
||||||
|
# even exotic BaseException subclasses. We want the cause logged.
|
||||||
|
_exit_diag(
|
||||||
|
"asyncio.run.exception",
|
||||||
|
exc_type=type(e).__name__,
|
||||||
|
exc_repr=repr(e),
|
||||||
|
traceback=_traceback.format_exc(),
|
||||||
|
)
|
||||||
|
raise
|
||||||
if not success:
|
if not success:
|
||||||
|
_exit_diag("gateway.exit_nonzero")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
_exit_diag("gateway.exit_clean")
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
|
||||||
|
|
@ -2805,12 +2805,18 @@ def _classify_worker_exit(pid: int) -> "tuple[str, Optional[int]]":
|
||||||
def _pid_alive(pid: Optional[int]) -> bool:
|
def _pid_alive(pid: Optional[int]) -> bool:
|
||||||
"""Return True if ``pid`` is still running on this host.
|
"""Return True if ``pid`` is still running on this host.
|
||||||
|
|
||||||
Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess``
|
Cross-platform: uses ``OpenProcess`` + ``WaitForSingleObject`` on
|
||||||
on Windows. Returns False for falsy PIDs or on any OS error.
|
Windows (via ``gateway.status._pid_exists``) and ``os.kill(pid, 0)``
|
||||||
|
on POSIX. Returns False for falsy PIDs or on any OS error.
|
||||||
|
|
||||||
**Zombie handling:** ``os.kill(pid, 0)`` succeeds against
|
**DO NOT** use ``os.kill(pid, 0)`` directly on Windows — Python's
|
||||||
zombie processes (post-exit, pre-reap) because the process table
|
Windows ``os.kill`` treats ``sig=0`` as ``CTRL_C_EVENT`` (bpo-14484)
|
||||||
entry still exists. A worker that exits without being reaped by its
|
and will broadcast it to the target's console group, potentially
|
||||||
|
killing unrelated processes.
|
||||||
|
|
||||||
|
**Zombie handling:** the existence check succeeds against zombie
|
||||||
|
processes (post-exit, pre-reap) because the process table entry
|
||||||
|
still exists. A worker that exits without being reaped by its
|
||||||
parent would stay "alive" to the dispatcher forever. Dispatcher
|
parent would stay "alive" to the dispatcher forever. Dispatcher
|
||||||
workers are started via ``start_new_session=True`` + intentional
|
workers are started via ``start_new_session=True`` + intentional
|
||||||
Popen handle abandonment, so init reaps them quickly — but during
|
Popen handle abandonment, so init reaps them quickly — but during
|
||||||
|
|
@ -2821,17 +2827,10 @@ def _pid_alive(pid: Optional[int]) -> bool:
|
||||||
"""
|
"""
|
||||||
if not pid or pid <= 0:
|
if not pid or pid <= 0:
|
||||||
return False
|
return False
|
||||||
try:
|
from gateway.status import _pid_exists
|
||||||
if hasattr(os, "kill"):
|
if not _pid_exists(int(pid)):
|
||||||
os.kill(int(pid), 0)
|
|
||||||
except ProcessLookupError:
|
|
||||||
return False
|
return False
|
||||||
except PermissionError:
|
# Still here → process exists. Check for zombie on platforms
|
||||||
# Process exists, we just can't signal it.
|
|
||||||
return True
|
|
||||||
except OSError:
|
|
||||||
return False
|
|
||||||
# Still here → kill(0) succeeded. Check for zombie on platforms
|
|
||||||
# where we have a cheap, deterministic process-state probe.
|
# where we have a cheap, deterministic process-state probe.
|
||||||
if sys.platform == "linux":
|
if sys.platform == "linux":
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -5787,16 +5787,14 @@ def _kill_stale_dashboard_processes(
|
||||||
while pending and _time.monotonic() < deadline:
|
while pending and _time.monotonic() < deadline:
|
||||||
_time.sleep(0.1)
|
_time.sleep(0.1)
|
||||||
still_pending = []
|
still_pending = []
|
||||||
|
# On Windows, os.kill(pid, 0) is NOT a no-op. Route through
|
||||||
|
# the cross-platform existence check.
|
||||||
|
from gateway.status import _pid_exists
|
||||||
for pid in pending:
|
for pid in pending:
|
||||||
try:
|
if _pid_exists(pid):
|
||||||
os.kill(pid, 0) # probe
|
|
||||||
except ProcessLookupError:
|
|
||||||
killed.append(pid)
|
|
||||||
except (PermissionError, OSError):
|
|
||||||
# Can't probe — assume still there.
|
|
||||||
still_pending.append(pid)
|
still_pending.append(pid)
|
||||||
else:
|
else:
|
||||||
still_pending.append(pid)
|
killed.append(pid)
|
||||||
pending = still_pending
|
pending = still_pending
|
||||||
|
|
||||||
# SIGKILL any survivors.
|
# SIGKILL any survivors.
|
||||||
|
|
|
||||||
|
|
@ -774,15 +774,13 @@ def _stop_gateway_process(profile_dir: Path) -> None:
|
||||||
# and raw os.kill with SIGTERM doesn't cascade to child processes
|
# and raw os.kill with SIGTERM doesn't cascade to child processes
|
||||||
# the same way taskkill /T does.
|
# the same way taskkill /T does.
|
||||||
from gateway.status import terminate_pid as _terminate_pid
|
from gateway.status import terminate_pid as _terminate_pid
|
||||||
|
from gateway.status import _pid_exists
|
||||||
_terminate_pid(pid) # graceful first
|
_terminate_pid(pid) # graceful first
|
||||||
# Wait up to 10s for graceful shutdown
|
# Wait up to 10s for graceful shutdown. On Windows, os.kill(pid, 0)
|
||||||
|
# is NOT a no-op — use the handle-based existence check.
|
||||||
for _ in range(20):
|
for _ in range(20):
|
||||||
_time.sleep(0.5)
|
_time.sleep(0.5)
|
||||||
try:
|
if not _pid_exists(pid):
|
||||||
os.kill(pid, 0)
|
|
||||||
except (ProcessLookupError, OSError):
|
|
||||||
# OSError covers Windows' WinError 87 "invalid parameter"
|
|
||||||
# returned for an invalid/gone PID probe.
|
|
||||||
print(f"✓ Gateway stopped (PID {pid})")
|
print(f"✓ Gateway stopped (PID {pid})")
|
||||||
return
|
return
|
||||||
# Force kill
|
# Force kill
|
||||||
|
|
|
||||||
|
|
@ -70,14 +70,11 @@ def _clear_active() -> None:
|
||||||
|
|
||||||
|
|
||||||
def _pid_alive(pid: int) -> bool:
|
def _pid_alive(pid: int) -> bool:
|
||||||
try:
|
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — it
|
||||||
os.kill(pid, 0)
|
# routes through GenerateConsoleCtrlEvent and can kill the target.
|
||||||
except ProcessLookupError:
|
# Use the cross-platform existence check.
|
||||||
return False
|
from gateway.status import _pid_exists
|
||||||
except PermissionError:
|
return _pid_exists(pid)
|
||||||
# Process exists but we can't signal it — treat as alive.
|
|
||||||
return True
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -1211,19 +1211,10 @@ def _reap_orphaned_browser_sessions():
|
||||||
if os.path.isfile(owner_pid_file):
|
if os.path.isfile(owner_pid_file):
|
||||||
try:
|
try:
|
||||||
owner_pid = int(Path(owner_pid_file).read_text(encoding="utf-8").strip())
|
owner_pid = int(Path(owner_pid_file).read_text(encoding="utf-8").strip())
|
||||||
try:
|
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484).
|
||||||
os.kill(owner_pid, 0)
|
# Use the cross-platform existence check.
|
||||||
owner_alive = True
|
from gateway.status import _pid_exists
|
||||||
except ProcessLookupError:
|
owner_alive = _pid_exists(owner_pid)
|
||||||
owner_alive = False
|
|
||||||
except PermissionError:
|
|
||||||
# Owner exists but we can't signal it (different uid).
|
|
||||||
# Treat as alive — don't reap someone else's session.
|
|
||||||
owner_alive = True
|
|
||||||
except OSError:
|
|
||||||
# Windows: gone PID raises OSError (WinError 87) instead
|
|
||||||
# of ProcessLookupError. Treat as dead to match POSIX.
|
|
||||||
owner_alive = False
|
|
||||||
except (ValueError, OSError):
|
except (ValueError, OSError):
|
||||||
owner_alive = None # corrupt file — fall through
|
owner_alive = None # corrupt file — fall through
|
||||||
|
|
||||||
|
|
@ -1250,19 +1241,10 @@ def _reap_orphaned_browser_sessions():
|
||||||
shutil.rmtree(socket_dir, ignore_errors=True)
|
shutil.rmtree(socket_dir, ignore_errors=True)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if the daemon is still alive
|
# Check if the daemon is still alive. ``os.kill(pid, 0)`` on Windows
|
||||||
try:
|
# is NOT a no-op — use the handle-based existence check.
|
||||||
os.kill(daemon_pid, 0) # signal 0 = existence check
|
from gateway.status import _pid_exists
|
||||||
except ProcessLookupError:
|
if not _pid_exists(daemon_pid):
|
||||||
# Already dead, just clean up the dir
|
|
||||||
shutil.rmtree(socket_dir, ignore_errors=True)
|
|
||||||
continue
|
|
||||||
except PermissionError:
|
|
||||||
# Alive but owned by someone else — leave it alone
|
|
||||||
continue
|
|
||||||
except OSError:
|
|
||||||
# Windows raises OSError (WinError 87) for a gone PID — treat
|
|
||||||
# as dead and clean up, mirroring the ProcessLookupError branch.
|
|
||||||
shutil.rmtree(socket_dir, ignore_errors=True)
|
shutil.rmtree(socket_dir, ignore_errors=True)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1251,9 +1251,10 @@ class MCPServerTask:
|
||||||
for _pid in new_pids:
|
for _pid in new_pids:
|
||||||
_stdio_pids.pop(_pid, None)
|
_stdio_pids.pop(_pid, None)
|
||||||
for pid in new_pids:
|
for pid in new_pids:
|
||||||
try:
|
# ``os.kill(pid, 0)`` is NOT a no-op on Windows
|
||||||
os.kill(pid, 0) # signal 0: probe liveness only
|
# (bpo-14484). Use the cross-platform check.
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
from gateway.status import _pid_exists
|
||||||
|
if not _pid_exists(pid):
|
||||||
continue # process already exited — nothing to do
|
continue # process already exited — nothing to do
|
||||||
_orphan_stdio_pids.add(pid)
|
_orphan_stdio_pids.add(pid)
|
||||||
|
|
||||||
|
|
@ -3369,16 +3370,20 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
||||||
|
|
||||||
# Phase 3: SIGKILL any survivors
|
# Phase 3: SIGKILL any survivors
|
||||||
_sigkill = getattr(_signal, "SIGKILL", _signal.SIGTERM)
|
_sigkill = getattr(_signal, "SIGKILL", _signal.SIGTERM)
|
||||||
|
# ``os.kill(pid, 0)`` is NOT a no-op on Windows. Use the cross-platform
|
||||||
|
# existence check before escalating to SIGKILL.
|
||||||
|
from gateway.status import _pid_exists
|
||||||
for pid, server_name in pids.items():
|
for pid, server_name in pids.items():
|
||||||
|
if not _pid_exists(pid):
|
||||||
|
continue # Good — exited after SIGTERM
|
||||||
try:
|
try:
|
||||||
os.kill(pid, 0) # Check if still alive
|
|
||||||
os.kill(pid, _sigkill)
|
os.kill(pid, _sigkill)
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Force-killed MCP process %d (%s) after SIGTERM timeout",
|
"Force-killed MCP process %d (%s) after SIGTERM timeout",
|
||||||
pid, server_name,
|
pid, server_name,
|
||||||
)
|
)
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
except (ProcessLookupError, PermissionError, OSError):
|
||||||
pass # Good — exited after SIGTERM
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _stop_mcp_loop():
|
def _stop_mcp_loop():
|
||||||
|
|
|
||||||
|
|
@ -404,15 +404,10 @@ class ProcessRegistry:
|
||||||
"""Best-effort liveness check for host-visible PIDs."""
|
"""Best-effort liveness check for host-visible PIDs."""
|
||||||
if not pid:
|
if not pid:
|
||||||
return False
|
return False
|
||||||
try:
|
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use
|
||||||
os.kill(pid, 0)
|
# the cross-platform existence check.
|
||||||
return True
|
from gateway.status import _pid_exists
|
||||||
except (ProcessLookupError, PermissionError, OSError):
|
return _pid_exists(pid)
|
||||||
# OSError covers Windows' WinError 87 for a gone PID, and the
|
|
||||||
# ``WinError 5 Access denied`` case — treat both as "can't probe
|
|
||||||
# or process is gone", which matches the conservative
|
|
||||||
# "not alive" semantics callers already handle.
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _refresh_detached_session(self, session: Optional[ProcessSession]) -> Optional[ProcessSession]:
|
def _refresh_detached_session(self, session: Optional[ProcessSession]) -> Optional[ProcessSession]:
|
||||||
"""Update recovered host-PID sessions when the underlying process has exited."""
|
"""Update recovered host-PID sessions when the underlying process has exited."""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue