diff --git a/hermes_cli/_subprocess_compat.py b/hermes_cli/_subprocess_compat.py index 4d4ad3f18dc..b6bce13ad1d 100644 --- a/hermes_cli/_subprocess_compat.py +++ b/hermes_cli/_subprocess_compat.py @@ -97,6 +97,16 @@ def resolve_node_command(name: str, argv: Sequence[str]) -> list[str]: _CREATE_NEW_PROCESS_GROUP = 0x00000200 _DETACHED_PROCESS = 0x00000008 _CREATE_NO_WINDOW = 0x08000000 +# Escape any Win32 job object the parent process belongs to. Without this, +# a detached child still inherits its parent's job object membership, and +# when that parent (Electron, Tauri, Windows Terminal, the Desktop GUI's +# bootstrap-installer) dies, the OS tears down the whole job — taking the +# "detached" child with it. Critical for the post-update gateway watcher: +# Electron spawns the Tauri updater inside its own job, the updater spawns +# the watcher subprocess; without BREAKAWAY the watcher dies the instant +# Electron exits, so the gateway never gets respawned after a `hermes +# update` triggered from the GUI. See fix/windows-gateway-reliability. +_CREATE_BREAKAWAY_FROM_JOB = 0x01000000 def windows_detach_flags() -> int: @@ -116,10 +126,30 @@ def windows_detach_flags() -> int: - ``CREATE_NO_WINDOW`` — suppress the brief cmd flash that would otherwise appear when launching a console app. Redundant with DETACHED_PROCESS but explicit for clarity. + - ``CREATE_BREAKAWAY_FROM_JOB`` — escape any job object the parent is + in. Electron (Desktop app) and Tauri (bootstrap installer) wrap + their children in job objects; without breakaway, those children + die when the parent process exits even if they were spawned with + DETACHED_PROCESS. This was the missing flag that made the + post-update gateway respawn watcher silently die alongside the + Tauri updater after the Electron Desktop's update flow finished. + + If a process is in a job that disallows breakaway (rare — + JOB_OBJECT_LIMIT_BREAKAWAY_OK isn't set), CreateProcess returns + ERROR_ACCESS_DENIED. Python surfaces that as ``PermissionError`` + on the ``subprocess.Popen`` call. Callers in this codebase already + wrap detached spawns in ``try/except OSError`` and fall back to a + cmd.exe wrapper, so the breakaway-denied case degrades gracefully + rather than crashing. """ if not IS_WINDOWS: return 0 - return _CREATE_NEW_PROCESS_GROUP | _DETACHED_PROCESS | _CREATE_NO_WINDOW + return ( + _CREATE_NEW_PROCESS_GROUP + | _DETACHED_PROCESS + | _CREATE_NO_WINDOW + | _CREATE_BREAKAWAY_FROM_JOB + ) def windows_hide_flags() -> int: diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index a148f0b24cd..03228004053 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -664,6 +664,10 @@ def launch_detached_profile_gateway_restart(profile: str, old_pid: int) -> bool: # Platform-appropriate detach for the respawned gateway. On POSIX # start_new_session=True maps to os.setsid; on Windows we need # explicit creationflags because start_new_session is a no-op there. + # CREATE_BREAKAWAY_FROM_JOB is critical: the watcher itself may have + # been spawned inside a job object (Electron/Tauri parent), and + # without breakaway the respawned gateway would die when that job + # tears down. See _subprocess_compat.windows_detach_flags(). _popen_kwargs = { "stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL, @@ -672,8 +676,12 @@ def launch_detached_profile_gateway_restart(profile: str, old_pid: int) -> bool: _CREATE_NEW_PROCESS_GROUP = 0x00000200 _DETACHED_PROCESS = 0x00000008 _CREATE_NO_WINDOW = 0x08000000 + _CREATE_BREAKAWAY_FROM_JOB = 0x01000000 _popen_kwargs["creationflags"] = ( - _CREATE_NEW_PROCESS_GROUP | _DETACHED_PROCESS | _CREATE_NO_WINDOW + _CREATE_NEW_PROCESS_GROUP + | _DETACHED_PROCESS + | _CREATE_NO_WINDOW + | _CREATE_BREAKAWAY_FROM_JOB ) else: _popen_kwargs["start_new_session"] = True diff --git a/hermes_cli/gateway_windows.py b/hermes_cli/gateway_windows.py index 64b5fd6dc29..08c7d8c019c 100644 --- a/hermes_cli/gateway_windows.py +++ b/hermes_cli/gateway_windows.py @@ -380,13 +380,26 @@ def _build_gateway_cmd_script( def _build_startup_launcher(script_path: Path) -> str: - """The tiny .cmd that goes in the Startup folder. Just minimizes and chains.""" + """The tiny .cmd that goes in the Startup folder. Just minimizes and chains. + + Defense-in-depth: bail out silently if the target script is gone. Test + fixtures historically wrote Startup entries pointing at pytest tmp_path + directories that vanish after the test session. Without the existence + guard, every subsequent Windows login flashes a cmd.exe window that + fails to find the target. The check + ``exit /b 0`` keeps that case + silent. + """ + quoted_target = _quote_cmd_script_arg(str(script_path)) lines = [ "@echo off", f"rem {_TASK_DESCRIPTION}", + # If the wrapper script is gone (typical for stale entries from + # uninstalled/migrated installs), silently no-op instead of + # flashing a cmd window with a "file not found" error. + f"if not exist {quoted_target} exit /b 0", # ``start "" /min`` detaches with a minimized console window. # ``/d /c`` on cmd.exe skips AUTORUN and runs the target script once. - f'start "" /min cmd.exe /d /c {_quote_cmd_script_arg(str(script_path))}', + f'start "" /min cmd.exe /d /c {quoted_target}', ] return "\r\n".join(lines) + "\r\n" @@ -478,6 +491,8 @@ def _install_scheduled_task(task_name: str, script_path: Path) -> tuple[bool, st return (False, f"schtasks /Create failed (code {last_code}): {last_err.strip()}") + + def _install_startup_entry(script_path: Path) -> Path: """Write the Startup-folder fallback launcher. Returns its path.""" entry = get_startup_entry_path() @@ -926,7 +941,10 @@ def uninstall() -> None: else: print(f"⚠ schtasks /Delete returned code {code}: {detail}") - for path, label in [(startup_entry, "Windows login item"), (script_path, "Task script")]: + for path, label in [ + (startup_entry, "Windows login item"), + (script_path, "Task script"), + ]: try: path.unlink() print(f"✓ Removed {label}: {path}") @@ -984,6 +1002,139 @@ def _gateway_pids() -> list[int]: return list(find_gateway_pids()) +def _print_deep_probes() -> None: + """Print PASS/FAIL per individual probe of gateway liveness. + + The default ``status`` output collapses several signals into one + ✓ / ✗ line, which is great when they agree and confusing when they + don't. The deep-probe block shows each underlying check independently + so the user can see exactly which signal is wrong. + + Probes: + [1] PID file present + [2] Lock file present and held by some process + [3] gateway.status.get_running_pid() returns a PID + [4] _pid_exists(pid) — OS confirms the process is alive + [5] gateway_state.json exists and parses (and is fresh-ish) + [6] Last lifecycle event in gateway-exit-diag.log + """ + import json + from datetime import datetime, timezone + + from hermes_cli.config import get_hermes_home + + home = Path(get_hermes_home()).resolve() + pid_path = home / "gateway.pid" + lock_path = home / "gateway.lock" + state_path = home / "gateway_state.json" + diag_path = home / "logs" / "gateway-exit-diag.log" + + print() + print("Deep probes:") + + def _mark(ok: bool) -> str: + return "PASS" if ok else "FAIL" + + # [1] PID file + pid_exists = pid_path.exists() + pid_value: int | None = None + if pid_exists: + try: + data = json.loads(pid_path.read_text(encoding="utf-8")) + pid_value = int(data.get("pid")) if data.get("pid") is not None else None + print(f" [1] {_mark(True):4s} PID file present: {pid_path} (pid={pid_value})") + except Exception as exc: + print(f" [1] {_mark(False):4s} PID file present but unreadable: {exc}") + else: + print(f" [1] {_mark(False):4s} PID file missing: {pid_path}") + + # [2] Lock file present + held + lock_held = False + lock_present = lock_path.exists() + if lock_present: + try: + from gateway.status import is_gateway_runtime_lock_active + + lock_held = is_gateway_runtime_lock_active(lock_path) + print(f" [2] {_mark(lock_held):4s} Lock file held by a live process: {lock_path}") + except Exception as exc: + print(f" [2] {_mark(False):4s} Could not probe lock: {exc}") + else: + print(f" [2] {_mark(False):4s} Lock file missing: {lock_path}") + + # [3] get_running_pid() + running_pid: int | None = None + try: + from gateway.status import get_running_pid + + running_pid = get_running_pid(cleanup_stale=False) + print(f" [3] {_mark(running_pid is not None):4s} get_running_pid() => {running_pid}") + except Exception as exc: + print(f" [3] {_mark(False):4s} get_running_pid() raised: {exc!r}") + + # [4] _pid_exists() on the probed PID + candidate_pid = running_pid if running_pid is not None else pid_value + if candidate_pid is not None: + try: + from gateway.status import _pid_exists + + alive = bool(_pid_exists(candidate_pid)) + print(f" [4] {_mark(alive):4s} _pid_exists({candidate_pid}) => {alive}") + except Exception as exc: + print(f" [4] {_mark(False):4s} _pid_exists raised: {exc!r}") + else: + print(f" [4] {_mark(False):4s} No candidate PID to verify") + + # [5] runtime status file + if state_path.exists(): + try: + state_data = json.loads(state_path.read_text(encoding="utf-8")) + gateway_state = state_data.get("gateway_state") + updated_at = state_data.get("updated_at") + age_str = "" + if updated_at: + try: + updated_dt = datetime.fromisoformat(updated_at.replace("Z", "+00:00")) + now = datetime.now(timezone.utc) + age_seconds = int((now - updated_dt).total_seconds()) + age_str = f" (updated {age_seconds}s ago)" + except Exception: + pass + ok = gateway_state == "running" + print(f" [5] {_mark(ok):4s} gateway_state.json state={gateway_state!r}{age_str}") + except Exception as exc: + print(f" [5] {_mark(False):4s} gateway_state.json present but unreadable: {exc}") + else: + print(f" [5] {_mark(False):4s} gateway_state.json missing: {state_path}") + + # [6] Last lifecycle event from the exit-diag log + if diag_path.exists(): + try: + with open(diag_path, "rb") as fh: + # Read last ~4KB; one event is well under 500 bytes. + fh.seek(0, 2) + size = fh.tell() + fh.seek(max(0, size - 4096)) + tail = fh.read().decode("utf-8", errors="replace").splitlines() + last_event = next((ln for ln in reversed(tail) if ln.strip()), "") + if last_event: + try: + event = json.loads(last_event) + tag = event.get("tag", "?") + pid = event.get("pid", "?") + ts = event.get("ts", "?") + healthy = tag in ("gateway.start",) + print(f" [6] {_mark(healthy):4s} Last lifecycle event: tag={tag} pid={pid} ts={ts}") + except Exception: + print(f" [6] {_mark(False):4s} Last lifecycle line not JSON: {last_event[:120]}") + else: + print(f" [6] {_mark(False):4s} exit-diag log empty: {diag_path}") + except Exception as exc: + print(f" [6] {_mark(False):4s} exit-diag log unreadable: {exc}") + else: + print(f" [6] {_mark(False):4s} exit-diag log missing: {diag_path}") + + def status(deep: bool = False) -> None: """Print a status report for the Windows gateway service.""" _assert_windows() @@ -1011,9 +1162,12 @@ def status(deep: bool = False) -> None: if deep: print() - print(f" Task name: {task_name}") - print(f" Task script: {get_task_script_path()}") - print(f" Startup entry: {get_startup_entry_path()}") + print(f" Task name: {task_name}") + print(f" Task script: {get_task_script_path()}") + print(f" Startup entry: {get_startup_entry_path()}") + # Surface the per-probe truth so the user can see *which* signal + # is lying when the high-level summary disagrees with reality. + _print_deep_probes() if not task_installed and not startup_installed and not pids: print() diff --git a/tests/hermes_cli/test_gateway_wsl.py b/tests/hermes_cli/test_gateway_wsl.py index d12391a9fb0..57007311f01 100644 --- a/tests/hermes_cli/test_gateway_wsl.py +++ b/tests/hermes_cli/test_gateway_wsl.py @@ -167,6 +167,14 @@ class TestGatewayCommandWSLMessages: monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False) monkeypatch.setattr(gateway, "is_macos", lambda: False) monkeypatch.setattr(gateway, "is_managed", lambda: False) + # CRITICAL: also stub is_windows. Without this, running this test on a + # real Windows host falls through to the is_windows() branch *before* + # the WSL guidance branch, invoking gateway_windows.install() which + # writes a Startup-folder .cmd into the real user's Startup folder + # (NOT tmp_path) pointing at a now-vanished pytest fixture path. + # The user then sees a broken Hermes_Gateway.cmd flash a cmd.exe + # window on every login. See fix/windows-gateway-reliability. + monkeypatch.setattr(gateway, "is_windows", lambda: False) args = SimpleNamespace( gateway_command="install", force=False, system=False, @@ -189,6 +197,10 @@ class TestGatewayCommandWSLMessages: monkeypatch.setattr(gateway, "is_wsl", lambda: True) monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False) monkeypatch.setattr(gateway, "is_macos", lambda: False) + # See test_install_wsl_no_systemd: stub is_windows so a Windows host + # running this test does NOT actually spawn a detached gateway via + # gateway_windows.start(). + monkeypatch.setattr(gateway, "is_windows", lambda: False) args = SimpleNamespace(gateway_command="start", system=False) with pytest.raises(SystemExit) as exc_info: @@ -206,6 +218,9 @@ class TestGatewayCommandWSLMessages: monkeypatch.setattr(gateway, "is_macos", lambda: False) monkeypatch.setattr(gateway, "is_termux", lambda: False) monkeypatch.setattr(gateway, "is_wsl", lambda: True) + # Stub is_windows so a Windows host running this test does NOT take + # the Windows status branch (which reads gateway_windows.is_installed()). + monkeypatch.setattr(gateway, "is_windows", lambda: False) monkeypatch.setattr(gateway, "find_gateway_pids", lambda: [12345]) monkeypatch.setattr(gateway, "_runtime_health_lines", lambda: []) # Stub out the systemd unit path check @@ -231,6 +246,8 @@ class TestGatewayCommandWSLMessages: monkeypatch.setattr(gateway, "is_macos", lambda: False) monkeypatch.setattr(gateway, "is_termux", lambda: False) monkeypatch.setattr(gateway, "is_wsl", lambda: True) + # See test_status_wsl_running_manual. + monkeypatch.setattr(gateway, "is_windows", lambda: False) monkeypatch.setattr(gateway, "find_gateway_pids", lambda: []) monkeypatch.setattr(gateway, "_runtime_health_lines", lambda: []) monkeypatch.setattr(