diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index fda25de410..5537e87ab4 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -107,12 +107,15 @@ def _kill_stale_bridge_by_pidfile(session_path: Path) -> None: except OSError: pass return - try: - os.kill(pid, 0) # check existence - os.kill(pid, signal.SIGTERM) - logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid) - except (ProcessLookupError, PermissionError, OSError): - pass + # ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use the + # cross-platform existence check before sending a real signal. + from gateway.status import _pid_exists + if _pid_exists(pid): + try: + os.kill(pid, signal.SIGTERM) + logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid) + except (ProcessLookupError, PermissionError, OSError): + pass try: pid_file.unlink() except OSError: diff --git a/gateway/run.py b/gateway/run.py index 0067e7bb68..cc534af363 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2805,10 +2805,36 @@ class GatewayRunner: pid = int(sys.argv[1]) cmd = sys.argv[2:] deadline = time.monotonic() + 120 - while time.monotonic() < deadline: + + def _alive(p): + # On Windows, os.kill(pid, 0) is NOT a no-op — it maps to + # GenerateConsoleCtrlEvent(0, pid) (bpo-14484). Use the + # Win32 handle-based existence check instead. + if os.name == 'nt': + import ctypes + k32 = ctypes.windll.kernel32 + k32.OpenProcess.restype = ctypes.c_void_p + k32.WaitForSingleObject.restype = ctypes.c_uint + k32.GetLastError.restype = ctypes.c_uint + h = k32.OpenProcess(0x1000 | 0x100000, False, int(p)) + if not h: + return k32.GetLastError() != 87 + try: + return k32.WaitForSingleObject(h, 0) == 0x102 + finally: + k32.CloseHandle(h) try: - os.kill(pid, 0) - except (ProcessLookupError, PermissionError, OSError): + os.kill(int(p), 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True + except OSError: + return False + + while time.monotonic() < deadline: + if not _alive(pid): break time.sleep(0.2) _CREATE_NEW_PROCESS_GROUP = 0x00000200 @@ -15189,16 +15215,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = except Exception: pass return False - # Wait up to 10 seconds for the old process to exit + # Wait up to 10 seconds for the old process to exit. + # ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the + # handle-based existence check instead. + from gateway.status import _pid_exists for _ in range(20): - try: - os.kill(existing_pid, 0) - time.sleep(0.5) - except (ProcessLookupError, PermissionError, OSError): - # OSError covers Windows' WinError 87 "invalid parameter" - # for an already-gone PID — without this the probe loop - # busy-spins for the full 10s on every --replace start. + if not _pid_exists(existing_pid): break # Process is gone + time.sleep(0.5) else: # Still alive after 10s — force kill logger.warning( diff --git a/gateway/status.py b/gateway/status.py index 33799bd16a..5e9b615d63 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -299,6 +299,71 @@ def _try_acquire_file_lock(handle) -> bool: return False +def _pid_exists(pid: int) -> bool: + """Cross-platform "is this PID alive" check that does NOT kill the target. + + CRITICAL on Windows: Python's ``os.kill(pid, 0)`` is NOT a no-op like it + is on POSIX. CPython's Windows implementation + (``Modules/posixmodule.c::os_kill_impl``) treats ``sig=0`` as + ``CTRL_C_EVENT`` because the two values collide at the C level, and + routes it through ``GenerateConsoleCtrlEvent(0, pid)`` — which sends + a Ctrl+C to the entire console process group containing the target + PID, not just the PID itself. Any caller that wanted to "check if + this PID is alive" via ``os.kill(pid, 0)`` on Windows was silently + killing that process (and often unrelated processes in the same + console group). Long-standing Python quirk; see bpo-14484. + + Fix: use the Win32 ``OpenProcess`` / ``WaitForSingleObject`` pair on + Windows to check existence without any signal path; use the POSIX + ``os.kill(pid, 0)`` idiom on POSIX where it actually is a no-op. + """ + if _IS_WINDOWS: + try: + import ctypes + kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined] + # Pin return types — default ctypes restype is c_int (signed), + # which mangles WAIT_* DWORD return codes into negative numbers. + kernel32.OpenProcess.restype = ctypes.c_void_p + kernel32.WaitForSingleObject.restype = ctypes.c_uint + kernel32.GetLastError.restype = ctypes.c_uint + PROCESS_QUERY_LIMITED_INFORMATION = 0x1000 + SYNCHRONIZE = 0x100000 # required for WaitForSingleObject + WAIT_TIMEOUT = 0x00000102 + ERROR_INVALID_PARAMETER = 87 + ERROR_ACCESS_DENIED = 5 + handle = kernel32.OpenProcess( + PROCESS_QUERY_LIMITED_INFORMATION | SYNCHRONIZE, False, int(pid) + ) + if not handle: + err = kernel32.GetLastError() + if err == ERROR_INVALID_PARAMETER: + return False # PID definitely gone + if err == ERROR_ACCESS_DENIED: + return True # Exists but owned by another user/session + return False # Conservative default for unknown errors + try: + wait_result = kernel32.WaitForSingleObject(handle, 0) + # WAIT_TIMEOUT = still running; anything else (WAIT_OBJECT_0 + # via exit, WAIT_FAILED via handle issue) = treat as gone. + return wait_result == WAIT_TIMEOUT + finally: + kernel32.CloseHandle(handle) + except (OSError, AttributeError): + return False + else: + try: + os.kill(int(pid), 0) + return True + except ProcessLookupError: + return False + except PermissionError: + # Process exists but we can't signal it — still alive. + return True + except OSError: + return False + + + def _release_file_lock(handle) -> None: try: if _IS_WINDOWS: @@ -503,10 +568,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, stale = existing_pid is None if not stale: - try: - os.kill(existing_pid, 0) - except (ProcessLookupError, PermissionError, OSError): - # Windows raises OSError with WinError 87 for invalid pid check + if not _pid_exists(existing_pid): stale = True else: current_start = _get_process_start_time(existing_pid) @@ -517,7 +579,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, ): stale = True # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped - # processes still respond to os.kill(pid, 0) but are not + # processes still appear alive to _pid_exists but are not # actually running. Treat them as stale so --replace works. if not stale: try: @@ -824,20 +886,7 @@ def get_running_pid( if pid is None: continue - try: - os.kill(pid, 0) # signal 0 = existence check, no actual signal sent - except ProcessLookupError: - continue - except PermissionError: - # The process exists but belongs to another user/service scope. - # With the runtime lock still held, prefer keeping it visible - # rather than deleting the PID file as "stale". - if _record_looks_like_gateway(record): - return pid - continue - except OSError: - # Windows raises OSError with WinError 87 for an invalid pid - # (process is definitely gone). Treat as "process doesn't exist". + if not _pid_exists(pid): continue recorded_start = record.get("start_time") diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index b9f44030e4..cfa9d66df8 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -223,18 +223,15 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool: import time as _time deadline = _time.monotonic() + max(drain_timeout, 1.0) + # IMPORTANT Windows note: ``os.kill(pid, 0)`` is NOT a no-op on + # Windows — Python's implementation calls ``TerminateProcess(handle, 0)`` + # for sig=0, hard-killing the target. Use the cross-platform + # ``_pid_exists`` helper in gateway.status which does OpenProcess + + # WaitForSingleObject on Windows. + from gateway.status import _pid_exists + while _time.monotonic() < deadline: - try: - os.kill(pid, 0) # signal 0 — probe liveness - except ProcessLookupError: - return True - except PermissionError: - # Process still exists but we can't signal it. Treat as alive - # so the caller falls back. - pass - except OSError: - # Windows raises OSError (WinError 87 "invalid parameter") for - # a gone PID — treat the same as ProcessLookupError. + if not _pid_exists(pid): return True _time.sleep(0.5) # Drain didn't finish in time. @@ -303,6 +300,11 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li or f"HERMES_HOME={current_home}" in command ) + # Default-profile case: no profile flag in argv. Accept as long as + # the command doesn't advertise *some other* profile. HERMES_HOME + # may be passed via env (not visible in wmic/CIM command line) so + # its absence is NOT disqualifying — only a non-matching explicit + # HERMES_HOME= in argv is. if "--profile " in command or " -p " in command: return False if "HERMES_HOME=" in command and f"HERMES_HOME={current_home}" not in command: @@ -513,14 +515,10 @@ def launch_detached_profile_gateway_restart(profile: str, old_pid: int) -> bool: cmd = sys.argv[2:] deadline = time.monotonic() + 120 while time.monotonic() < deadline: - try: - os.kill(pid, 0) - except ProcessLookupError: - break - except PermissionError: - pass - except OSError: - # Windows: gone PID raises OSError (WinError 87). + # ``os.kill(pid, 0)`` is not a no-op on Windows — use the + # cross-platform existence check. + from gateway.status import _pid_exists + if not _pid_exists(pid): break time.sleep(0.2) @@ -1007,15 +1005,14 @@ def stop_profile_gateway() -> bool: print(f"⚠ Permission denied to kill PID {pid}") return False - # Wait briefly for it to exit + # Wait briefly for it to exit. On Windows, os.kill(pid, 0) is NOT + # a no-op — route through the cross-platform existence check. import time as _time + from gateway.status import _pid_exists for _ in range(20): - try: - os.kill(pid, 0) - _time.sleep(0.5) - except (ProcessLookupError, PermissionError, OSError): - # OSError covers Windows' WinError 87 for gone PIDs. + if not _pid_exists(pid): break + _time.sleep(0.5) if get_running_pid() is None: remove_pid_file() @@ -2910,22 +2907,49 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False): # stays alive; real user Ctrl+C still comes through prompt_toolkit / # the asyncio signal handler when running in a real console. # - # Detection: sys.stdin is None / stdin-not-a-tty AND we're on Windows. - # This mirrors the same pattern as ``cli.py`` (commit 449ad952b). + # IMPORTANT lesson (May 2026): we originally gated this on "stdin is + # NOT a TTY" assuming only detached pythonw runs would be vulnerable. + # Wrong. When the user runs `hermes gateway start` from a PowerShell + # console, the gateway inherits that console and stdin IS a TTY — + # but it's STILL vulnerable to CTRL_C_EVENT broadcast by any sibling + # `hermes` invocation (like `hermes gateway status` 30 seconds later) + # because Windows routes console events to all processes sharing the + # console. Every hermes CLI process after that sibling fires is a + # potential drive-by killer. So on Windows, for `gateway run` + # specifically (never interactive by design), always install the + # SIGINT absorber regardless of TTY state. + try: + _stdin_is_tty = bool(sys.stdin and sys.stdin.isatty()) + except (ValueError, OSError): + _stdin_is_tty = False if is_windows(): try: - _stdin_is_tty = bool(sys.stdin and sys.stdin.isatty()) - except (ValueError, OSError): - _stdin_is_tty = False - if not _stdin_is_tty: - try: - signal.signal(signal.SIGINT, signal.SIG_IGN) - if hasattr(signal, "SIGBREAK"): - signal.signal(signal.SIGBREAK, signal.SIG_IGN) - except (OSError, ValueError): - # SetConsoleCtrlHandler not available (rare on Windows) — - # best-effort, proceed either way. - pass + signal.signal(signal.SIGINT, signal.SIG_IGN) + if hasattr(signal, "SIGBREAK"): + signal.signal(signal.SIGBREAK, signal.SIG_IGN) + except (OSError, ValueError): + # SetConsoleCtrlHandler not available (rare on Windows) — + # best-effort, proceed either way. + pass + # Python's signal module only hooks SIGINT/SIGBREAK. To also + # absorb CTRL_CLOSE_EVENT / CTRL_LOGOFF_EVENT and any other + # console control signals Windows may broadcast to the console + # process group, call the native SetConsoleCtrlHandler(NULL, TRUE) + # — this tells the kernel to IGNORE all console control events + # for this process entirely, which is what background services + # are supposed to do. Belt-and-braces over the Python-level + # handlers above. + try: + import ctypes + kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined] + # BOOL SetConsoleCtrlHandler(NULL, Add) — Add=TRUE means + # "install the NULL handler", which has the documented + # effect of ignoring Ctrl+C. Called twice for defense in + # depth: once before any Python import could have flipped + # our disposition, once as our last word. + kernel32.SetConsoleCtrlHandler(None, 1) + except (OSError, AttributeError): + pass # Refresh the systemd unit definition on every boot so that restart # settings (RestartSec, StartLimitIntervalSec, etc.) stay current even @@ -2954,15 +2978,86 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False): # Exit with code 1 if gateway fails to connect any platform, # so systemd Restart=always will retry on transient errors verbosity = None if quiet else verbose + + # ── Exit-path diagnostics ──────────────────────────────────────────── + # When the gateway dies silently on Windows (no shutdown log, no + # traceback in gateway.log / errors.log), we're usually blind to the + # cause. The code below captures *every* way the asyncio.run() call + # below can return, with full context dumped to a dedicated log so + # the next silent death yields evidence instead of a mystery. This + # is diagnostic scaffolding; cheap to keep on, costs nothing during + # normal operation, and the emitted lines are opt-in via the + # HERMES_GATEWAY_EXIT_DIAG env var (default: on while we're still + # chasing the Windows lifecycle bug). + import atexit as _atexit + import traceback as _traceback + from datetime import datetime as _dt, timezone as _tz + + def _exit_diag(tag: str, **extra: object) -> None: + if os.environ.get("HERMES_GATEWAY_EXIT_DIAG", "1") != "1": + return + try: + from hermes_constants import get_hermes_home as _ghh + log_dir = _ghh() / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + ts = _dt.now(_tz.utc).isoformat() + line = { + "ts": ts, + "tag": tag, + "pid": os.getpid(), + "python": sys.version.split()[0], + "platform": sys.platform, + **extra, + } + import json as _json + with open(log_dir / "gateway-exit-diag.log", "a", encoding="utf-8") as f: + f.write(_json.dumps(line, default=str) + "\n") + except Exception: + pass # never let the diagnostic itself crash the gateway + + _exit_diag( + "gateway.start", + replace=replace, + argv=sys.argv, + stdin_is_tty=_stdin_is_tty, + ) + + def _atexit_hook() -> None: + _exit_diag("atexit.hook", sys_exc=repr(sys.exc_info())) + + _atexit.register(_atexit_hook) + + success = False try: success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity)) + _exit_diag("asyncio.run.returned", success=success) except KeyboardInterrupt: # On Windows-detached runs this shouldn't fire (we absorb SIGINT above), # but keep the handler for console runs. + _exit_diag( + "asyncio.run.KeyboardInterrupt", + traceback=_traceback.format_exc(), + ) print("\nGateway stopped.") return + except SystemExit as e: + _exit_diag("asyncio.run.SystemExit", code=getattr(e, "code", None), + traceback=_traceback.format_exc()) + raise + except BaseException as e: + # Absolutely everything else: Exception, asyncio.CancelledError, + # even exotic BaseException subclasses. We want the cause logged. + _exit_diag( + "asyncio.run.exception", + exc_type=type(e).__name__, + exc_repr=repr(e), + traceback=_traceback.format_exc(), + ) + raise if not success: + _exit_diag("gateway.exit_nonzero") sys.exit(1) + _exit_diag("gateway.exit_clean") # ============================================================================= diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index a66e090769..06b4f96ceb 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2805,12 +2805,18 @@ def _classify_worker_exit(pid: int) -> "tuple[str, Optional[int]]": def _pid_alive(pid: Optional[int]) -> bool: """Return True if ``pid`` is still running on this host. - Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess`` - on Windows. Returns False for falsy PIDs or on any OS error. + Cross-platform: uses ``OpenProcess`` + ``WaitForSingleObject`` on + Windows (via ``gateway.status._pid_exists``) and ``os.kill(pid, 0)`` + on POSIX. Returns False for falsy PIDs or on any OS error. - **Zombie handling:** ``os.kill(pid, 0)`` succeeds against - zombie processes (post-exit, pre-reap) because the process table - entry still exists. A worker that exits without being reaped by its + **DO NOT** use ``os.kill(pid, 0)`` directly on Windows — Python's + Windows ``os.kill`` treats ``sig=0`` as ``CTRL_C_EVENT`` (bpo-14484) + and will broadcast it to the target's console group, potentially + killing unrelated processes. + + **Zombie handling:** the existence check succeeds against zombie + processes (post-exit, pre-reap) because the process table entry + still exists. A worker that exits without being reaped by its parent would stay "alive" to the dispatcher forever. Dispatcher workers are started via ``start_new_session=True`` + intentional Popen handle abandonment, so init reaps them quickly — but during @@ -2821,17 +2827,10 @@ def _pid_alive(pid: Optional[int]) -> bool: """ if not pid or pid <= 0: return False - try: - if hasattr(os, "kill"): - os.kill(int(pid), 0) - except ProcessLookupError: + from gateway.status import _pid_exists + if not _pid_exists(int(pid)): return False - except PermissionError: - # Process exists, we just can't signal it. - return True - except OSError: - return False - # Still here → kill(0) succeeded. Check for zombie on platforms + # Still here → process exists. Check for zombie on platforms # where we have a cheap, deterministic process-state probe. if sys.platform == "linux": try: diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 9583ab148c..b14afae0b3 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -5787,16 +5787,14 @@ def _kill_stale_dashboard_processes( while pending and _time.monotonic() < deadline: _time.sleep(0.1) still_pending = [] + # On Windows, os.kill(pid, 0) is NOT a no-op. Route through + # the cross-platform existence check. + from gateway.status import _pid_exists for pid in pending: - try: - os.kill(pid, 0) # probe - except ProcessLookupError: - killed.append(pid) - except (PermissionError, OSError): - # Can't probe — assume still there. + if _pid_exists(pid): still_pending.append(pid) else: - still_pending.append(pid) + killed.append(pid) pending = still_pending # SIGKILL any survivors. diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index 43d48c26e4..06bd70f7cb 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -774,15 +774,13 @@ def _stop_gateway_process(profile_dir: Path) -> None: # and raw os.kill with SIGTERM doesn't cascade to child processes # the same way taskkill /T does. from gateway.status import terminate_pid as _terminate_pid + from gateway.status import _pid_exists _terminate_pid(pid) # graceful first - # Wait up to 10s for graceful shutdown + # Wait up to 10s for graceful shutdown. On Windows, os.kill(pid, 0) + # is NOT a no-op — use the handle-based existence check. for _ in range(20): _time.sleep(0.5) - try: - os.kill(pid, 0) - except (ProcessLookupError, OSError): - # OSError covers Windows' WinError 87 "invalid parameter" - # returned for an invalid/gone PID probe. + if not _pid_exists(pid): print(f"✓ Gateway stopped (PID {pid})") return # Force kill diff --git a/plugins/google_meet/process_manager.py b/plugins/google_meet/process_manager.py index a5da48b83b..e16ff4fa6a 100644 --- a/plugins/google_meet/process_manager.py +++ b/plugins/google_meet/process_manager.py @@ -70,14 +70,11 @@ def _clear_active() -> None: def _pid_alive(pid: int) -> bool: - try: - os.kill(pid, 0) - except ProcessLookupError: - return False - except PermissionError: - # Process exists but we can't signal it — treat as alive. - return True - return True + # ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — it + # routes through GenerateConsoleCtrlEvent and can kill the target. + # Use the cross-platform existence check. + from gateway.status import _pid_exists + return _pid_exists(pid) # --------------------------------------------------------------------------- diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 1d6a732134..ee642db8bd 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -1211,19 +1211,10 @@ def _reap_orphaned_browser_sessions(): if os.path.isfile(owner_pid_file): try: owner_pid = int(Path(owner_pid_file).read_text(encoding="utf-8").strip()) - try: - os.kill(owner_pid, 0) - owner_alive = True - except ProcessLookupError: - owner_alive = False - except PermissionError: - # Owner exists but we can't signal it (different uid). - # Treat as alive — don't reap someone else's session. - owner_alive = True - except OSError: - # Windows: gone PID raises OSError (WinError 87) instead - # of ProcessLookupError. Treat as dead to match POSIX. - owner_alive = False + # ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484). + # Use the cross-platform existence check. + from gateway.status import _pid_exists + owner_alive = _pid_exists(owner_pid) except (ValueError, OSError): owner_alive = None # corrupt file — fall through @@ -1250,19 +1241,10 @@ def _reap_orphaned_browser_sessions(): shutil.rmtree(socket_dir, ignore_errors=True) continue - # Check if the daemon is still alive - try: - os.kill(daemon_pid, 0) # signal 0 = existence check - except ProcessLookupError: - # Already dead, just clean up the dir - shutil.rmtree(socket_dir, ignore_errors=True) - continue - except PermissionError: - # Alive but owned by someone else — leave it alone - continue - except OSError: - # Windows raises OSError (WinError 87) for a gone PID — treat - # as dead and clean up, mirroring the ProcessLookupError branch. + # Check if the daemon is still alive. ``os.kill(pid, 0)`` on Windows + # is NOT a no-op — use the handle-based existence check. + from gateway.status import _pid_exists + if not _pid_exists(daemon_pid): shutil.rmtree(socket_dir, ignore_errors=True) continue diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 8eb469597e..1e10b276f1 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -1251,9 +1251,10 @@ class MCPServerTask: for _pid in new_pids: _stdio_pids.pop(_pid, None) for pid in new_pids: - try: - os.kill(pid, 0) # signal 0: probe liveness only - except (ProcessLookupError, PermissionError, OSError): + # ``os.kill(pid, 0)`` is NOT a no-op on Windows + # (bpo-14484). Use the cross-platform check. + from gateway.status import _pid_exists + if not _pid_exists(pid): continue # process already exited — nothing to do _orphan_stdio_pids.add(pid) @@ -3369,16 +3370,20 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: # Phase 3: SIGKILL any survivors _sigkill = getattr(_signal, "SIGKILL", _signal.SIGTERM) + # ``os.kill(pid, 0)`` is NOT a no-op on Windows. Use the cross-platform + # existence check before escalating to SIGKILL. + from gateway.status import _pid_exists for pid, server_name in pids.items(): + if not _pid_exists(pid): + continue # Good — exited after SIGTERM try: - os.kill(pid, 0) # Check if still alive os.kill(pid, _sigkill) logger.warning( "Force-killed MCP process %d (%s) after SIGTERM timeout", pid, server_name, ) except (ProcessLookupError, PermissionError, OSError): - pass # Good — exited after SIGTERM + pass def _stop_mcp_loop(): diff --git a/tools/process_registry.py b/tools/process_registry.py index a5fd5c8e51..a145e8c281 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -404,15 +404,10 @@ class ProcessRegistry: """Best-effort liveness check for host-visible PIDs.""" if not pid: return False - try: - os.kill(pid, 0) - return True - except (ProcessLookupError, PermissionError, OSError): - # OSError covers Windows' WinError 87 for a gone PID, and the - # ``WinError 5 Access denied`` case — treat both as "can't probe - # or process is gone", which matches the conservative - # "not alive" semantics callers already handle. - return False + # ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use + # the cross-platform existence check. + from gateway.status import _pid_exists + return _pid_exists(pid) def _refresh_detached_session(self, session: Optional[ProcessSession]) -> Optional[ProcessSession]: """Update recovered host-PID sessions when the underlying process has exited."""