mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
fix(windows): os.kill(pid, 0) is NOT a no-op on Windows — route through new _pid_exists helper
On Windows, Python's ``os.kill(pid, 0)`` is NOT a no-op. CPython's
implementation (``Modules/posixmodule.c::os_kill_impl``) treats sig=0
as ``CTRL_C_EVENT`` because the two integer values collide at the C
layer, and routes it through ``GenerateConsoleCtrlEvent(0, pid)`` —
which sends a Ctrl+C to the ENTIRE console process group containing
the target PID, not just the PID itself. Any caller that wanted to
check "is PID X alive" via the classic POSIX ``os.kill(pid, 0)``
idiom was silently killing that process (and often unrelated
processes in the same console group) on Windows. Long-standing
Python Windows quirk; see bpo-14484 (open since 2012).
This manifested in Hermes as: every ``hermes gateway status``
invocation would read the gateway's PID from the PID file, call
``os.kill(pid, 0)`` via ``gateway.status.get_running_pid()`` as a
"liveness check", and instantly terminate the gateway it was trying
to report on. No shutdown log, no traceback, no atexit hook fire,
no exit-diag entry — just silent termination of the detached pythonw
process. "Bot answered one message then stopped typing" was the
characteristic end-user symptom because `os.kill(pid, 0)` fires
mid-response-send and kills the gateway between logs.
Reproduction (verified in this branch before the fix):
$ hermes gateway start # gateway alive, PID 37520
$ hermes gateway status # reports "No gateway process detected"
$ tasklist /FI "PID eq 37520" # INFO: No tasks are running
# — gateway terminated silently
Root-cause fix is a new ``gateway.status._pid_exists(pid)`` helper:
- On Windows: Win32 ``OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION |
SYNCHRONIZE, False, pid)`` + ``WaitForSingleObject(handle, 0)``
via ctypes. Zero signal delivery, zero console-group side effects.
Pins ctypes return types to avoid DWORD-vs-signed-int parse bugs
on WAIT_TIMEOUT (0x102). Distinguishes ERROR_INVALID_PARAMETER
(PID gone) from ERROR_ACCESS_DENIED (alive but another user).
- On POSIX: the canonical ``os.kill(pid, 0)`` idiom that actually is
a no-op there.
Then patch every ``os.kill(pid, 0)`` liveness-check callsite to
route through ``_pid_exists`` instead. Total 14 callsites across
11 files; every single one was a latent silent-kill on Windows:
gateway/run.py:2810 — /restart watcher (inline subprocess)
gateway/run.py:15195 — --replace wait loop
gateway/status.py:572 — acquire_gateway_runtime_lock stale check
gateway/status.py:828 — get_running_pid (THE killer for status)
gateway/platforms/whatsapp.py:111
hermes_cli/gateway.py:228, 522, 1012 — gateway-related drain loops
hermes_cli/kanban_db.py:2826 — _pid_alive was claiming to
be cross-platform but used
os.kill(pid, 0) on Windows
hermes_cli/main.py:5792 — CLI process-kill polling
hermes_cli/profiles.py:782 — profile stop wait loop
plugins/google_meet/process_manager.py:74
tools/browser_tool.py:1215, 1255 — browser daemon ownership probes
tools/mcp_tool.py:1255, 3374 — MCP stdio orphan tracking
The watcher source in gateway/run.py:2810 is a multi-line string
that gets spawned as an inline ``python -c "..."`` subprocess, so
it can't import gateway.status. The fix for that callsite inlines
the same ctypes probe directly into the watcher source.
Tested on Windows 10 with the hermes gateway + Telegram bot:
- gateway start → alive
- 5 consecutive ``hermes gateway status`` invocations → gateway
alive after every one, same PID reported each time (37520, 21952)
- gateway.log shows uninterrupted operation; no spurious shutdown
entries; cron ticker and kanban dispatcher still running on
their 60-second cadence
- bot continues answering Telegram messages throughout
Ships alongside an exit-path diagnostic wrapper in
``hermes_cli/gateway.py::run_gateway()`` that captures every way
``asyncio.run(start_gateway(...))`` can return (success, SystemExit,
KeyboardInterrupt, BaseException, atexit) with full traceback to
``logs/gateway-exit-diag.log``. This was used to prove the gateway
was being hard-killed externally (no exit event fired) and should
be kept for future Windows debugging.
Refs: https://bugs.python.org/issue14484
See also: references/windows-subprocess-sigint-storm.md in
the hermes-agent skill.
This commit is contained in:
parent
ac178b78c4
commit
1cbe399149
11 changed files with 296 additions and 151 deletions
|
|
@ -107,12 +107,15 @@ def _kill_stale_bridge_by_pidfile(session_path: Path) -> None:
|
|||
except OSError:
|
||||
pass
|
||||
return
|
||||
try:
|
||||
os.kill(pid, 0) # check existence
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use the
|
||||
# cross-platform existence check before sending a real signal.
|
||||
from gateway.status import _pid_exists
|
||||
if _pid_exists(pid):
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
try:
|
||||
pid_file.unlink()
|
||||
except OSError:
|
||||
|
|
|
|||
|
|
@ -2805,10 +2805,36 @@ class GatewayRunner:
|
|||
pid = int(sys.argv[1])
|
||||
cmd = sys.argv[2:]
|
||||
deadline = time.monotonic() + 120
|
||||
while time.monotonic() < deadline:
|
||||
|
||||
def _alive(p):
|
||||
# On Windows, os.kill(pid, 0) is NOT a no-op — it maps to
|
||||
# GenerateConsoleCtrlEvent(0, pid) (bpo-14484). Use the
|
||||
# Win32 handle-based existence check instead.
|
||||
if os.name == 'nt':
|
||||
import ctypes
|
||||
k32 = ctypes.windll.kernel32
|
||||
k32.OpenProcess.restype = ctypes.c_void_p
|
||||
k32.WaitForSingleObject.restype = ctypes.c_uint
|
||||
k32.GetLastError.restype = ctypes.c_uint
|
||||
h = k32.OpenProcess(0x1000 | 0x100000, False, int(p))
|
||||
if not h:
|
||||
return k32.GetLastError() != 87
|
||||
try:
|
||||
return k32.WaitForSingleObject(h, 0) == 0x102
|
||||
finally:
|
||||
k32.CloseHandle(h)
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
os.kill(int(p), 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
if not _alive(pid):
|
||||
break
|
||||
time.sleep(0.2)
|
||||
_CREATE_NEW_PROCESS_GROUP = 0x00000200
|
||||
|
|
@ -15189,16 +15215,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
except Exception:
|
||||
pass
|
||||
return False
|
||||
# Wait up to 10 seconds for the old process to exit
|
||||
# Wait up to 10 seconds for the old process to exit.
|
||||
# ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the
|
||||
# handle-based existence check instead.
|
||||
from gateway.status import _pid_exists
|
||||
for _ in range(20):
|
||||
try:
|
||||
os.kill(existing_pid, 0)
|
||||
time.sleep(0.5)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# OSError covers Windows' WinError 87 "invalid parameter"
|
||||
# for an already-gone PID — without this the probe loop
|
||||
# busy-spins for the full 10s on every --replace start.
|
||||
if not _pid_exists(existing_pid):
|
||||
break # Process is gone
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
# Still alive after 10s — force kill
|
||||
logger.warning(
|
||||
|
|
|
|||
|
|
@ -299,6 +299,71 @@ def _try_acquire_file_lock(handle) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _pid_exists(pid: int) -> bool:
|
||||
"""Cross-platform "is this PID alive" check that does NOT kill the target.
|
||||
|
||||
CRITICAL on Windows: Python's ``os.kill(pid, 0)`` is NOT a no-op like it
|
||||
is on POSIX. CPython's Windows implementation
|
||||
(``Modules/posixmodule.c::os_kill_impl``) treats ``sig=0`` as
|
||||
``CTRL_C_EVENT`` because the two values collide at the C level, and
|
||||
routes it through ``GenerateConsoleCtrlEvent(0, pid)`` — which sends
|
||||
a Ctrl+C to the entire console process group containing the target
|
||||
PID, not just the PID itself. Any caller that wanted to "check if
|
||||
this PID is alive" via ``os.kill(pid, 0)`` on Windows was silently
|
||||
killing that process (and often unrelated processes in the same
|
||||
console group). Long-standing Python quirk; see bpo-14484.
|
||||
|
||||
Fix: use the Win32 ``OpenProcess`` / ``WaitForSingleObject`` pair on
|
||||
Windows to check existence without any signal path; use the POSIX
|
||||
``os.kill(pid, 0)`` idiom on POSIX where it actually is a no-op.
|
||||
"""
|
||||
if _IS_WINDOWS:
|
||||
try:
|
||||
import ctypes
|
||||
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
|
||||
# Pin return types — default ctypes restype is c_int (signed),
|
||||
# which mangles WAIT_* DWORD return codes into negative numbers.
|
||||
kernel32.OpenProcess.restype = ctypes.c_void_p
|
||||
kernel32.WaitForSingleObject.restype = ctypes.c_uint
|
||||
kernel32.GetLastError.restype = ctypes.c_uint
|
||||
PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
|
||||
SYNCHRONIZE = 0x100000 # required for WaitForSingleObject
|
||||
WAIT_TIMEOUT = 0x00000102
|
||||
ERROR_INVALID_PARAMETER = 87
|
||||
ERROR_ACCESS_DENIED = 5
|
||||
handle = kernel32.OpenProcess(
|
||||
PROCESS_QUERY_LIMITED_INFORMATION | SYNCHRONIZE, False, int(pid)
|
||||
)
|
||||
if not handle:
|
||||
err = kernel32.GetLastError()
|
||||
if err == ERROR_INVALID_PARAMETER:
|
||||
return False # PID definitely gone
|
||||
if err == ERROR_ACCESS_DENIED:
|
||||
return True # Exists but owned by another user/session
|
||||
return False # Conservative default for unknown errors
|
||||
try:
|
||||
wait_result = kernel32.WaitForSingleObject(handle, 0)
|
||||
# WAIT_TIMEOUT = still running; anything else (WAIT_OBJECT_0
|
||||
# via exit, WAIT_FAILED via handle issue) = treat as gone.
|
||||
return wait_result == WAIT_TIMEOUT
|
||||
finally:
|
||||
kernel32.CloseHandle(handle)
|
||||
except (OSError, AttributeError):
|
||||
return False
|
||||
else:
|
||||
try:
|
||||
os.kill(int(pid), 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
# Process exists but we can't signal it — still alive.
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def _release_file_lock(handle) -> None:
|
||||
try:
|
||||
if _IS_WINDOWS:
|
||||
|
|
@ -503,10 +568,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
|||
|
||||
stale = existing_pid is None
|
||||
if not stale:
|
||||
try:
|
||||
os.kill(existing_pid, 0)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# Windows raises OSError with WinError 87 for invalid pid check
|
||||
if not _pid_exists(existing_pid):
|
||||
stale = True
|
||||
else:
|
||||
current_start = _get_process_start_time(existing_pid)
|
||||
|
|
@ -517,7 +579,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
|||
):
|
||||
stale = True
|
||||
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
||||
# processes still respond to os.kill(pid, 0) but are not
|
||||
# processes still appear alive to _pid_exists but are not
|
||||
# actually running. Treat them as stale so --replace works.
|
||||
if not stale:
|
||||
try:
|
||||
|
|
@ -824,20 +886,7 @@ def get_running_pid(
|
|||
if pid is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||
except ProcessLookupError:
|
||||
continue
|
||||
except PermissionError:
|
||||
# The process exists but belongs to another user/service scope.
|
||||
# With the runtime lock still held, prefer keeping it visible
|
||||
# rather than deleting the PID file as "stale".
|
||||
if _record_looks_like_gateway(record):
|
||||
return pid
|
||||
continue
|
||||
except OSError:
|
||||
# Windows raises OSError with WinError 87 for an invalid pid
|
||||
# (process is definitely gone). Treat as "process doesn't exist".
|
||||
if not _pid_exists(pid):
|
||||
continue
|
||||
|
||||
recorded_start = record.get("start_time")
|
||||
|
|
|
|||
|
|
@ -223,18 +223,15 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool:
|
|||
import time as _time
|
||||
|
||||
deadline = _time.monotonic() + max(drain_timeout, 1.0)
|
||||
# IMPORTANT Windows note: ``os.kill(pid, 0)`` is NOT a no-op on
|
||||
# Windows — Python's implementation calls ``TerminateProcess(handle, 0)``
|
||||
# for sig=0, hard-killing the target. Use the cross-platform
|
||||
# ``_pid_exists`` helper in gateway.status which does OpenProcess +
|
||||
# WaitForSingleObject on Windows.
|
||||
from gateway.status import _pid_exists
|
||||
|
||||
while _time.monotonic() < deadline:
|
||||
try:
|
||||
os.kill(pid, 0) # signal 0 — probe liveness
|
||||
except ProcessLookupError:
|
||||
return True
|
||||
except PermissionError:
|
||||
# Process still exists but we can't signal it. Treat as alive
|
||||
# so the caller falls back.
|
||||
pass
|
||||
except OSError:
|
||||
# Windows raises OSError (WinError 87 "invalid parameter") for
|
||||
# a gone PID — treat the same as ProcessLookupError.
|
||||
if not _pid_exists(pid):
|
||||
return True
|
||||
_time.sleep(0.5)
|
||||
# Drain didn't finish in time.
|
||||
|
|
@ -303,6 +300,11 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
or f"HERMES_HOME={current_home}" in command
|
||||
)
|
||||
|
||||
# Default-profile case: no profile flag in argv. Accept as long as
|
||||
# the command doesn't advertise *some other* profile. HERMES_HOME
|
||||
# may be passed via env (not visible in wmic/CIM command line) so
|
||||
# its absence is NOT disqualifying — only a non-matching explicit
|
||||
# HERMES_HOME= in argv is.
|
||||
if "--profile " in command or " -p " in command:
|
||||
return False
|
||||
if "HERMES_HOME=" in command and f"HERMES_HOME={current_home}" not in command:
|
||||
|
|
@ -513,14 +515,10 @@ def launch_detached_profile_gateway_restart(profile: str, old_pid: int) -> bool:
|
|||
cmd = sys.argv[2:]
|
||||
deadline = time.monotonic() + 120
|
||||
while time.monotonic() < deadline:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
break
|
||||
except PermissionError:
|
||||
pass
|
||||
except OSError:
|
||||
# Windows: gone PID raises OSError (WinError 87).
|
||||
# ``os.kill(pid, 0)`` is not a no-op on Windows — use the
|
||||
# cross-platform existence check.
|
||||
from gateway.status import _pid_exists
|
||||
if not _pid_exists(pid):
|
||||
break
|
||||
time.sleep(0.2)
|
||||
|
||||
|
|
@ -1007,15 +1005,14 @@ def stop_profile_gateway() -> bool:
|
|||
print(f"⚠ Permission denied to kill PID {pid}")
|
||||
return False
|
||||
|
||||
# Wait briefly for it to exit
|
||||
# Wait briefly for it to exit. On Windows, os.kill(pid, 0) is NOT
|
||||
# a no-op — route through the cross-platform existence check.
|
||||
import time as _time
|
||||
from gateway.status import _pid_exists
|
||||
for _ in range(20):
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
_time.sleep(0.5)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# OSError covers Windows' WinError 87 for gone PIDs.
|
||||
if not _pid_exists(pid):
|
||||
break
|
||||
_time.sleep(0.5)
|
||||
|
||||
if get_running_pid() is None:
|
||||
remove_pid_file()
|
||||
|
|
@ -2910,22 +2907,49 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
|
|||
# stays alive; real user Ctrl+C still comes through prompt_toolkit /
|
||||
# the asyncio signal handler when running in a real console.
|
||||
#
|
||||
# Detection: sys.stdin is None / stdin-not-a-tty AND we're on Windows.
|
||||
# This mirrors the same pattern as ``cli.py`` (commit 449ad952b).
|
||||
# IMPORTANT lesson (May 2026): we originally gated this on "stdin is
|
||||
# NOT a TTY" assuming only detached pythonw runs would be vulnerable.
|
||||
# Wrong. When the user runs `hermes gateway start` from a PowerShell
|
||||
# console, the gateway inherits that console and stdin IS a TTY —
|
||||
# but it's STILL vulnerable to CTRL_C_EVENT broadcast by any sibling
|
||||
# `hermes` invocation (like `hermes gateway status` 30 seconds later)
|
||||
# because Windows routes console events to all processes sharing the
|
||||
# console. Every hermes CLI process after that sibling fires is a
|
||||
# potential drive-by killer. So on Windows, for `gateway run`
|
||||
# specifically (never interactive by design), always install the
|
||||
# SIGINT absorber regardless of TTY state.
|
||||
try:
|
||||
_stdin_is_tty = bool(sys.stdin and sys.stdin.isatty())
|
||||
except (ValueError, OSError):
|
||||
_stdin_is_tty = False
|
||||
if is_windows():
|
||||
try:
|
||||
_stdin_is_tty = bool(sys.stdin and sys.stdin.isatty())
|
||||
except (ValueError, OSError):
|
||||
_stdin_is_tty = False
|
||||
if not _stdin_is_tty:
|
||||
try:
|
||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||
if hasattr(signal, "SIGBREAK"):
|
||||
signal.signal(signal.SIGBREAK, signal.SIG_IGN)
|
||||
except (OSError, ValueError):
|
||||
# SetConsoleCtrlHandler not available (rare on Windows) —
|
||||
# best-effort, proceed either way.
|
||||
pass
|
||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||
if hasattr(signal, "SIGBREAK"):
|
||||
signal.signal(signal.SIGBREAK, signal.SIG_IGN)
|
||||
except (OSError, ValueError):
|
||||
# SetConsoleCtrlHandler not available (rare on Windows) —
|
||||
# best-effort, proceed either way.
|
||||
pass
|
||||
# Python's signal module only hooks SIGINT/SIGBREAK. To also
|
||||
# absorb CTRL_CLOSE_EVENT / CTRL_LOGOFF_EVENT and any other
|
||||
# console control signals Windows may broadcast to the console
|
||||
# process group, call the native SetConsoleCtrlHandler(NULL, TRUE)
|
||||
# — this tells the kernel to IGNORE all console control events
|
||||
# for this process entirely, which is what background services
|
||||
# are supposed to do. Belt-and-braces over the Python-level
|
||||
# handlers above.
|
||||
try:
|
||||
import ctypes
|
||||
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
|
||||
# BOOL SetConsoleCtrlHandler(NULL, Add) — Add=TRUE means
|
||||
# "install the NULL handler", which has the documented
|
||||
# effect of ignoring Ctrl+C. Called twice for defense in
|
||||
# depth: once before any Python import could have flipped
|
||||
# our disposition, once as our last word.
|
||||
kernel32.SetConsoleCtrlHandler(None, 1)
|
||||
except (OSError, AttributeError):
|
||||
pass
|
||||
|
||||
# Refresh the systemd unit definition on every boot so that restart
|
||||
# settings (RestartSec, StartLimitIntervalSec, etc.) stay current even
|
||||
|
|
@ -2954,15 +2978,86 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
|
|||
# Exit with code 1 if gateway fails to connect any platform,
|
||||
# so systemd Restart=always will retry on transient errors
|
||||
verbosity = None if quiet else verbose
|
||||
|
||||
# ── Exit-path diagnostics ────────────────────────────────────────────
|
||||
# When the gateway dies silently on Windows (no shutdown log, no
|
||||
# traceback in gateway.log / errors.log), we're usually blind to the
|
||||
# cause. The code below captures *every* way the asyncio.run() call
|
||||
# below can return, with full context dumped to a dedicated log so
|
||||
# the next silent death yields evidence instead of a mystery. This
|
||||
# is diagnostic scaffolding; cheap to keep on, costs nothing during
|
||||
# normal operation, and the emitted lines are opt-in via the
|
||||
# HERMES_GATEWAY_EXIT_DIAG env var (default: on while we're still
|
||||
# chasing the Windows lifecycle bug).
|
||||
import atexit as _atexit
|
||||
import traceback as _traceback
|
||||
from datetime import datetime as _dt, timezone as _tz
|
||||
|
||||
def _exit_diag(tag: str, **extra: object) -> None:
|
||||
if os.environ.get("HERMES_GATEWAY_EXIT_DIAG", "1") != "1":
|
||||
return
|
||||
try:
|
||||
from hermes_constants import get_hermes_home as _ghh
|
||||
log_dir = _ghh() / "logs"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
ts = _dt.now(_tz.utc).isoformat()
|
||||
line = {
|
||||
"ts": ts,
|
||||
"tag": tag,
|
||||
"pid": os.getpid(),
|
||||
"python": sys.version.split()[0],
|
||||
"platform": sys.platform,
|
||||
**extra,
|
||||
}
|
||||
import json as _json
|
||||
with open(log_dir / "gateway-exit-diag.log", "a", encoding="utf-8") as f:
|
||||
f.write(_json.dumps(line, default=str) + "\n")
|
||||
except Exception:
|
||||
pass # never let the diagnostic itself crash the gateway
|
||||
|
||||
_exit_diag(
|
||||
"gateway.start",
|
||||
replace=replace,
|
||||
argv=sys.argv,
|
||||
stdin_is_tty=_stdin_is_tty,
|
||||
)
|
||||
|
||||
def _atexit_hook() -> None:
|
||||
_exit_diag("atexit.hook", sys_exc=repr(sys.exc_info()))
|
||||
|
||||
_atexit.register(_atexit_hook)
|
||||
|
||||
success = False
|
||||
try:
|
||||
success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))
|
||||
_exit_diag("asyncio.run.returned", success=success)
|
||||
except KeyboardInterrupt:
|
||||
# On Windows-detached runs this shouldn't fire (we absorb SIGINT above),
|
||||
# but keep the handler for console runs.
|
||||
_exit_diag(
|
||||
"asyncio.run.KeyboardInterrupt",
|
||||
traceback=_traceback.format_exc(),
|
||||
)
|
||||
print("\nGateway stopped.")
|
||||
return
|
||||
except SystemExit as e:
|
||||
_exit_diag("asyncio.run.SystemExit", code=getattr(e, "code", None),
|
||||
traceback=_traceback.format_exc())
|
||||
raise
|
||||
except BaseException as e:
|
||||
# Absolutely everything else: Exception, asyncio.CancelledError,
|
||||
# even exotic BaseException subclasses. We want the cause logged.
|
||||
_exit_diag(
|
||||
"asyncio.run.exception",
|
||||
exc_type=type(e).__name__,
|
||||
exc_repr=repr(e),
|
||||
traceback=_traceback.format_exc(),
|
||||
)
|
||||
raise
|
||||
if not success:
|
||||
_exit_diag("gateway.exit_nonzero")
|
||||
sys.exit(1)
|
||||
_exit_diag("gateway.exit_clean")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -2805,12 +2805,18 @@ def _classify_worker_exit(pid: int) -> "tuple[str, Optional[int]]":
|
|||
def _pid_alive(pid: Optional[int]) -> bool:
|
||||
"""Return True if ``pid`` is still running on this host.
|
||||
|
||||
Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess``
|
||||
on Windows. Returns False for falsy PIDs or on any OS error.
|
||||
Cross-platform: uses ``OpenProcess`` + ``WaitForSingleObject`` on
|
||||
Windows (via ``gateway.status._pid_exists``) and ``os.kill(pid, 0)``
|
||||
on POSIX. Returns False for falsy PIDs or on any OS error.
|
||||
|
||||
**Zombie handling:** ``os.kill(pid, 0)`` succeeds against
|
||||
zombie processes (post-exit, pre-reap) because the process table
|
||||
entry still exists. A worker that exits without being reaped by its
|
||||
**DO NOT** use ``os.kill(pid, 0)`` directly on Windows — Python's
|
||||
Windows ``os.kill`` treats ``sig=0`` as ``CTRL_C_EVENT`` (bpo-14484)
|
||||
and will broadcast it to the target's console group, potentially
|
||||
killing unrelated processes.
|
||||
|
||||
**Zombie handling:** the existence check succeeds against zombie
|
||||
processes (post-exit, pre-reap) because the process table entry
|
||||
still exists. A worker that exits without being reaped by its
|
||||
parent would stay "alive" to the dispatcher forever. Dispatcher
|
||||
workers are started via ``start_new_session=True`` + intentional
|
||||
Popen handle abandonment, so init reaps them quickly — but during
|
||||
|
|
@ -2821,17 +2827,10 @@ def _pid_alive(pid: Optional[int]) -> bool:
|
|||
"""
|
||||
if not pid or pid <= 0:
|
||||
return False
|
||||
try:
|
||||
if hasattr(os, "kill"):
|
||||
os.kill(int(pid), 0)
|
||||
except ProcessLookupError:
|
||||
from gateway.status import _pid_exists
|
||||
if not _pid_exists(int(pid)):
|
||||
return False
|
||||
except PermissionError:
|
||||
# Process exists, we just can't signal it.
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
# Still here → kill(0) succeeded. Check for zombie on platforms
|
||||
# Still here → process exists. Check for zombie on platforms
|
||||
# where we have a cheap, deterministic process-state probe.
|
||||
if sys.platform == "linux":
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -5787,16 +5787,14 @@ def _kill_stale_dashboard_processes(
|
|||
while pending and _time.monotonic() < deadline:
|
||||
_time.sleep(0.1)
|
||||
still_pending = []
|
||||
# On Windows, os.kill(pid, 0) is NOT a no-op. Route through
|
||||
# the cross-platform existence check.
|
||||
from gateway.status import _pid_exists
|
||||
for pid in pending:
|
||||
try:
|
||||
os.kill(pid, 0) # probe
|
||||
except ProcessLookupError:
|
||||
killed.append(pid)
|
||||
except (PermissionError, OSError):
|
||||
# Can't probe — assume still there.
|
||||
if _pid_exists(pid):
|
||||
still_pending.append(pid)
|
||||
else:
|
||||
still_pending.append(pid)
|
||||
killed.append(pid)
|
||||
pending = still_pending
|
||||
|
||||
# SIGKILL any survivors.
|
||||
|
|
|
|||
|
|
@ -774,15 +774,13 @@ def _stop_gateway_process(profile_dir: Path) -> None:
|
|||
# and raw os.kill with SIGTERM doesn't cascade to child processes
|
||||
# the same way taskkill /T does.
|
||||
from gateway.status import terminate_pid as _terminate_pid
|
||||
from gateway.status import _pid_exists
|
||||
_terminate_pid(pid) # graceful first
|
||||
# Wait up to 10s for graceful shutdown
|
||||
# Wait up to 10s for graceful shutdown. On Windows, os.kill(pid, 0)
|
||||
# is NOT a no-op — use the handle-based existence check.
|
||||
for _ in range(20):
|
||||
_time.sleep(0.5)
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except (ProcessLookupError, OSError):
|
||||
# OSError covers Windows' WinError 87 "invalid parameter"
|
||||
# returned for an invalid/gone PID probe.
|
||||
if not _pid_exists(pid):
|
||||
print(f"✓ Gateway stopped (PID {pid})")
|
||||
return
|
||||
# Force kill
|
||||
|
|
|
|||
|
|
@ -70,14 +70,11 @@ def _clear_active() -> None:
|
|||
|
||||
|
||||
def _pid_alive(pid: int) -> bool:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
# Process exists but we can't signal it — treat as alive.
|
||||
return True
|
||||
return True
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — it
|
||||
# routes through GenerateConsoleCtrlEvent and can kill the target.
|
||||
# Use the cross-platform existence check.
|
||||
from gateway.status import _pid_exists
|
||||
return _pid_exists(pid)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -1211,19 +1211,10 @@ def _reap_orphaned_browser_sessions():
|
|||
if os.path.isfile(owner_pid_file):
|
||||
try:
|
||||
owner_pid = int(Path(owner_pid_file).read_text(encoding="utf-8").strip())
|
||||
try:
|
||||
os.kill(owner_pid, 0)
|
||||
owner_alive = True
|
||||
except ProcessLookupError:
|
||||
owner_alive = False
|
||||
except PermissionError:
|
||||
# Owner exists but we can't signal it (different uid).
|
||||
# Treat as alive — don't reap someone else's session.
|
||||
owner_alive = True
|
||||
except OSError:
|
||||
# Windows: gone PID raises OSError (WinError 87) instead
|
||||
# of ProcessLookupError. Treat as dead to match POSIX.
|
||||
owner_alive = False
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484).
|
||||
# Use the cross-platform existence check.
|
||||
from gateway.status import _pid_exists
|
||||
owner_alive = _pid_exists(owner_pid)
|
||||
except (ValueError, OSError):
|
||||
owner_alive = None # corrupt file — fall through
|
||||
|
||||
|
|
@ -1250,19 +1241,10 @@ def _reap_orphaned_browser_sessions():
|
|||
shutil.rmtree(socket_dir, ignore_errors=True)
|
||||
continue
|
||||
|
||||
# Check if the daemon is still alive
|
||||
try:
|
||||
os.kill(daemon_pid, 0) # signal 0 = existence check
|
||||
except ProcessLookupError:
|
||||
# Already dead, just clean up the dir
|
||||
shutil.rmtree(socket_dir, ignore_errors=True)
|
||||
continue
|
||||
except PermissionError:
|
||||
# Alive but owned by someone else — leave it alone
|
||||
continue
|
||||
except OSError:
|
||||
# Windows raises OSError (WinError 87) for a gone PID — treat
|
||||
# as dead and clean up, mirroring the ProcessLookupError branch.
|
||||
# Check if the daemon is still alive. ``os.kill(pid, 0)`` on Windows
|
||||
# is NOT a no-op — use the handle-based existence check.
|
||||
from gateway.status import _pid_exists
|
||||
if not _pid_exists(daemon_pid):
|
||||
shutil.rmtree(socket_dir, ignore_errors=True)
|
||||
continue
|
||||
|
||||
|
|
|
|||
|
|
@ -1251,9 +1251,10 @@ class MCPServerTask:
|
|||
for _pid in new_pids:
|
||||
_stdio_pids.pop(_pid, None)
|
||||
for pid in new_pids:
|
||||
try:
|
||||
os.kill(pid, 0) # signal 0: probe liveness only
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows
|
||||
# (bpo-14484). Use the cross-platform check.
|
||||
from gateway.status import _pid_exists
|
||||
if not _pid_exists(pid):
|
||||
continue # process already exited — nothing to do
|
||||
_orphan_stdio_pids.add(pid)
|
||||
|
||||
|
|
@ -3369,16 +3370,20 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
|||
|
||||
# Phase 3: SIGKILL any survivors
|
||||
_sigkill = getattr(_signal, "SIGKILL", _signal.SIGTERM)
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows. Use the cross-platform
|
||||
# existence check before escalating to SIGKILL.
|
||||
from gateway.status import _pid_exists
|
||||
for pid, server_name in pids.items():
|
||||
if not _pid_exists(pid):
|
||||
continue # Good — exited after SIGTERM
|
||||
try:
|
||||
os.kill(pid, 0) # Check if still alive
|
||||
os.kill(pid, _sigkill)
|
||||
logger.warning(
|
||||
"Force-killed MCP process %d (%s) after SIGTERM timeout",
|
||||
pid, server_name,
|
||||
)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass # Good — exited after SIGTERM
|
||||
pass
|
||||
|
||||
|
||||
def _stop_mcp_loop():
|
||||
|
|
|
|||
|
|
@ -404,15 +404,10 @@ class ProcessRegistry:
|
|||
"""Best-effort liveness check for host-visible PIDs."""
|
||||
if not pid:
|
||||
return False
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
return True
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# OSError covers Windows' WinError 87 for a gone PID, and the
|
||||
# ``WinError 5 Access denied`` case — treat both as "can't probe
|
||||
# or process is gone", which matches the conservative
|
||||
# "not alive" semantics callers already handle.
|
||||
return False
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use
|
||||
# the cross-platform existence check.
|
||||
from gateway.status import _pid_exists
|
||||
return _pid_exists(pid)
|
||||
|
||||
def _refresh_detached_session(self, session: Optional[ProcessSession]) -> Optional[ProcessSession]:
|
||||
"""Update recovered host-PID sessions when the underlying process has exited."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue