mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-14 04:02:26 +00:00
fix(windows): os.kill(pid, 0) is NOT a no-op on Windows — route through new _pid_exists helper
On Windows, Python's ``os.kill(pid, 0)`` is NOT a no-op. CPython's
implementation (``Modules/posixmodule.c::os_kill_impl``) treats sig=0
as ``CTRL_C_EVENT`` because the two integer values collide at the C
layer, and routes it through ``GenerateConsoleCtrlEvent(0, pid)`` —
which sends a Ctrl+C to the ENTIRE console process group containing
the target PID, not just the PID itself. Any caller that wanted to
check "is PID X alive" via the classic POSIX ``os.kill(pid, 0)``
idiom was silently killing that process (and often unrelated
processes in the same console group) on Windows. Long-standing
Python Windows quirk; see bpo-14484 (open since 2012).
This manifested in Hermes as: every ``hermes gateway status``
invocation would read the gateway's PID from the PID file, call
``os.kill(pid, 0)`` via ``gateway.status.get_running_pid()`` as a
"liveness check", and instantly terminate the gateway it was trying
to report on. No shutdown log, no traceback, no atexit hook fire,
no exit-diag entry — just silent termination of the detached pythonw
process. "Bot answered one message then stopped typing" was the
characteristic end-user symptom because `os.kill(pid, 0)` fires
mid-response-send and kills the gateway between logs.
Reproduction (verified in this branch before the fix):
$ hermes gateway start # gateway alive, PID 37520
$ hermes gateway status # reports "No gateway process detected"
$ tasklist /FI "PID eq 37520" # INFO: No tasks are running
# — gateway terminated silently
Root-cause fix is a new ``gateway.status._pid_exists(pid)`` helper:
- On Windows: Win32 ``OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION |
SYNCHRONIZE, False, pid)`` + ``WaitForSingleObject(handle, 0)``
via ctypes. Zero signal delivery, zero console-group side effects.
Pins ctypes return types to avoid DWORD-vs-signed-int parse bugs
on WAIT_TIMEOUT (0x102). Distinguishes ERROR_INVALID_PARAMETER
(PID gone) from ERROR_ACCESS_DENIED (alive but another user).
- On POSIX: the canonical ``os.kill(pid, 0)`` idiom that actually is
a no-op there.
Then patch every ``os.kill(pid, 0)`` liveness-check callsite to
route through ``_pid_exists`` instead. Total 14 callsites across
11 files; every single one was a latent silent-kill on Windows:
gateway/run.py:2810 — /restart watcher (inline subprocess)
gateway/run.py:15195 — --replace wait loop
gateway/status.py:572 — acquire_gateway_runtime_lock stale check
gateway/status.py:828 — get_running_pid (THE killer for status)
gateway/platforms/whatsapp.py:111
hermes_cli/gateway.py:228, 522, 1012 — gateway-related drain loops
hermes_cli/kanban_db.py:2826 — _pid_alive was claiming to
be cross-platform but used
os.kill(pid, 0) on Windows
hermes_cli/main.py:5792 — CLI process-kill polling
hermes_cli/profiles.py:782 — profile stop wait loop
plugins/google_meet/process_manager.py:74
tools/browser_tool.py:1215, 1255 — browser daemon ownership probes
tools/mcp_tool.py:1255, 3374 — MCP stdio orphan tracking
The watcher source in gateway/run.py:2810 is a multi-line string
that gets spawned as an inline ``python -c "..."`` subprocess, so
it can't import gateway.status. The fix for that callsite inlines
the same ctypes probe directly into the watcher source.
Tested on Windows 10 with the hermes gateway + Telegram bot:
- gateway start → alive
- 5 consecutive ``hermes gateway status`` invocations → gateway
alive after every one, same PID reported each time (37520, 21952)
- gateway.log shows uninterrupted operation; no spurious shutdown
entries; cron ticker and kanban dispatcher still running on
their 60-second cadence
- bot continues answering Telegram messages throughout
Ships alongside an exit-path diagnostic wrapper in
``hermes_cli/gateway.py::run_gateway()`` that captures every way
``asyncio.run(start_gateway(...))`` can return (success, SystemExit,
KeyboardInterrupt, BaseException, atexit) with full traceback to
``logs/gateway-exit-diag.log``. This was used to prove the gateway
was being hard-killed externally (no exit event fired) and should
be kept for future Windows debugging.
Refs: https://bugs.python.org/issue14484
See also: references/windows-subprocess-sigint-storm.md in
the hermes-agent skill.
This commit is contained in:
parent
ac178b78c4
commit
1cbe399149
11 changed files with 296 additions and 151 deletions
|
|
@ -107,12 +107,15 @@ def _kill_stale_bridge_by_pidfile(session_path: Path) -> None:
|
|||
except OSError:
|
||||
pass
|
||||
return
|
||||
try:
|
||||
os.kill(pid, 0) # check existence
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows (bpo-14484) — use the
|
||||
# cross-platform existence check before sending a real signal.
|
||||
from gateway.status import _pid_exists
|
||||
if _pid_exists(pid):
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
logger.info("[whatsapp] Killed stale bridge PID %d from pidfile", pid)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
try:
|
||||
pid_file.unlink()
|
||||
except OSError:
|
||||
|
|
|
|||
|
|
@ -2805,10 +2805,36 @@ class GatewayRunner:
|
|||
pid = int(sys.argv[1])
|
||||
cmd = sys.argv[2:]
|
||||
deadline = time.monotonic() + 120
|
||||
while time.monotonic() < deadline:
|
||||
|
||||
def _alive(p):
|
||||
# On Windows, os.kill(pid, 0) is NOT a no-op — it maps to
|
||||
# GenerateConsoleCtrlEvent(0, pid) (bpo-14484). Use the
|
||||
# Win32 handle-based existence check instead.
|
||||
if os.name == 'nt':
|
||||
import ctypes
|
||||
k32 = ctypes.windll.kernel32
|
||||
k32.OpenProcess.restype = ctypes.c_void_p
|
||||
k32.WaitForSingleObject.restype = ctypes.c_uint
|
||||
k32.GetLastError.restype = ctypes.c_uint
|
||||
h = k32.OpenProcess(0x1000 | 0x100000, False, int(p))
|
||||
if not h:
|
||||
return k32.GetLastError() != 87
|
||||
try:
|
||||
return k32.WaitForSingleObject(h, 0) == 0x102
|
||||
finally:
|
||||
k32.CloseHandle(h)
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
os.kill(int(p), 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
if not _alive(pid):
|
||||
break
|
||||
time.sleep(0.2)
|
||||
_CREATE_NEW_PROCESS_GROUP = 0x00000200
|
||||
|
|
@ -15189,16 +15215,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
except Exception:
|
||||
pass
|
||||
return False
|
||||
# Wait up to 10 seconds for the old process to exit
|
||||
# Wait up to 10 seconds for the old process to exit.
|
||||
# ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the
|
||||
# handle-based existence check instead.
|
||||
from gateway.status import _pid_exists
|
||||
for _ in range(20):
|
||||
try:
|
||||
os.kill(existing_pid, 0)
|
||||
time.sleep(0.5)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# OSError covers Windows' WinError 87 "invalid parameter"
|
||||
# for an already-gone PID — without this the probe loop
|
||||
# busy-spins for the full 10s on every --replace start.
|
||||
if not _pid_exists(existing_pid):
|
||||
break # Process is gone
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
# Still alive after 10s — force kill
|
||||
logger.warning(
|
||||
|
|
|
|||
|
|
@ -299,6 +299,71 @@ def _try_acquire_file_lock(handle) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _pid_exists(pid: int) -> bool:
|
||||
"""Cross-platform "is this PID alive" check that does NOT kill the target.
|
||||
|
||||
CRITICAL on Windows: Python's ``os.kill(pid, 0)`` is NOT a no-op like it
|
||||
is on POSIX. CPython's Windows implementation
|
||||
(``Modules/posixmodule.c::os_kill_impl``) treats ``sig=0`` as
|
||||
``CTRL_C_EVENT`` because the two values collide at the C level, and
|
||||
routes it through ``GenerateConsoleCtrlEvent(0, pid)`` — which sends
|
||||
a Ctrl+C to the entire console process group containing the target
|
||||
PID, not just the PID itself. Any caller that wanted to "check if
|
||||
this PID is alive" via ``os.kill(pid, 0)`` on Windows was silently
|
||||
killing that process (and often unrelated processes in the same
|
||||
console group). Long-standing Python quirk; see bpo-14484.
|
||||
|
||||
Fix: use the Win32 ``OpenProcess`` / ``WaitForSingleObject`` pair on
|
||||
Windows to check existence without any signal path; use the POSIX
|
||||
``os.kill(pid, 0)`` idiom on POSIX where it actually is a no-op.
|
||||
"""
|
||||
if _IS_WINDOWS:
|
||||
try:
|
||||
import ctypes
|
||||
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
|
||||
# Pin return types — default ctypes restype is c_int (signed),
|
||||
# which mangles WAIT_* DWORD return codes into negative numbers.
|
||||
kernel32.OpenProcess.restype = ctypes.c_void_p
|
||||
kernel32.WaitForSingleObject.restype = ctypes.c_uint
|
||||
kernel32.GetLastError.restype = ctypes.c_uint
|
||||
PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
|
||||
SYNCHRONIZE = 0x100000 # required for WaitForSingleObject
|
||||
WAIT_TIMEOUT = 0x00000102
|
||||
ERROR_INVALID_PARAMETER = 87
|
||||
ERROR_ACCESS_DENIED = 5
|
||||
handle = kernel32.OpenProcess(
|
||||
PROCESS_QUERY_LIMITED_INFORMATION | SYNCHRONIZE, False, int(pid)
|
||||
)
|
||||
if not handle:
|
||||
err = kernel32.GetLastError()
|
||||
if err == ERROR_INVALID_PARAMETER:
|
||||
return False # PID definitely gone
|
||||
if err == ERROR_ACCESS_DENIED:
|
||||
return True # Exists but owned by another user/session
|
||||
return False # Conservative default for unknown errors
|
||||
try:
|
||||
wait_result = kernel32.WaitForSingleObject(handle, 0)
|
||||
# WAIT_TIMEOUT = still running; anything else (WAIT_OBJECT_0
|
||||
# via exit, WAIT_FAILED via handle issue) = treat as gone.
|
||||
return wait_result == WAIT_TIMEOUT
|
||||
finally:
|
||||
kernel32.CloseHandle(handle)
|
||||
except (OSError, AttributeError):
|
||||
return False
|
||||
else:
|
||||
try:
|
||||
os.kill(int(pid), 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
# Process exists but we can't signal it — still alive.
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def _release_file_lock(handle) -> None:
|
||||
try:
|
||||
if _IS_WINDOWS:
|
||||
|
|
@ -503,10 +568,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
|||
|
||||
stale = existing_pid is None
|
||||
if not stale:
|
||||
try:
|
||||
os.kill(existing_pid, 0)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
# Windows raises OSError with WinError 87 for invalid pid check
|
||||
if not _pid_exists(existing_pid):
|
||||
stale = True
|
||||
else:
|
||||
current_start = _get_process_start_time(existing_pid)
|
||||
|
|
@ -517,7 +579,7 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
|||
):
|
||||
stale = True
|
||||
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
||||
# processes still respond to os.kill(pid, 0) but are not
|
||||
# processes still appear alive to _pid_exists but are not
|
||||
# actually running. Treat them as stale so --replace works.
|
||||
if not stale:
|
||||
try:
|
||||
|
|
@ -824,20 +886,7 @@ def get_running_pid(
|
|||
if pid is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||
except ProcessLookupError:
|
||||
continue
|
||||
except PermissionError:
|
||||
# The process exists but belongs to another user/service scope.
|
||||
# With the runtime lock still held, prefer keeping it visible
|
||||
# rather than deleting the PID file as "stale".
|
||||
if _record_looks_like_gateway(record):
|
||||
return pid
|
||||
continue
|
||||
except OSError:
|
||||
# Windows raises OSError with WinError 87 for an invalid pid
|
||||
# (process is definitely gone). Treat as "process doesn't exist".
|
||||
if not _pid_exists(pid):
|
||||
continue
|
||||
|
||||
recorded_start = record.get("start_time")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue