fix(update): stop Windows gateways before mutating install

This commit is contained in:
helix4u 2026-06-13 01:08:43 -06:00 committed by Teknium
parent 957a8ffa88
commit 78c11d99e3
2 changed files with 284 additions and 1 deletions

View file

@ -8030,6 +8030,182 @@ def _run_pre_update_backup(args) -> None:
print()
def _write_update_planned_stop_marker(profile_path: Path, pid: int) -> bool:
"""Write a planned-stop marker into a specific profile home."""
try:
from datetime import timezone
from gateway.status import _get_process_start_time
from utils import atomic_json_write
record = {
"target_pid": pid,
"target_start_time": _get_process_start_time(pid),
"stopper_pid": os.getpid(),
"written_at": datetime.now(timezone.utc).isoformat(),
}
atomic_json_write(
Path(profile_path) / ".gateway-planned-stop.json",
record,
indent=None,
separators=(",", ":"),
)
return True
except (OSError, PermissionError):
return False
def _wait_for_windows_update_gateway_exit(
pids: list[int], *, timeout: float
) -> set[int]:
"""Wait for the given gateway PIDs to exit, returning survivors."""
if not pids:
return set()
from gateway.status import _pid_exists
remaining = set(pids)
deadline = _time.monotonic() + max(timeout, 0.0)
while remaining and _time.monotonic() < deadline:
for pid in list(remaining):
try:
if not _pid_exists(pid):
remaining.discard(pid)
except Exception:
remaining.discard(pid)
if remaining:
_time.sleep(0.25)
survivors: set[int] = set()
for pid in remaining:
try:
if _pid_exists(pid):
survivors.add(pid)
except Exception:
pass
return survivors
def _pause_windows_gateways_for_update() -> dict | None:
"""Stop running Windows gateways before mutating the checkout or venv.
Windows scheduled/startup gateways run through pythonw.exe, so the generic
hermes.exe concurrent-instance guard does not see them. They still import
from the checkout and can keep files locked while ``git`` or ``uv`` updates
the install. Stop only PIDs that the gateway discovery code identifies.
"""
if not _is_windows():
return None
try:
from gateway.status import terminate_pid
from hermes_cli.gateway import (
_get_restart_drain_timeout,
find_gateway_pids,
find_profile_gateway_processes,
)
except Exception as exc:
logger.debug("Could not prepare Windows gateway pause for update: %s", exc)
return None
try:
running_pids = list(dict.fromkeys(find_gateway_pids(all_profiles=True)))
except Exception as exc:
logger.debug("Could not discover Windows gateway PIDs before update: %s", exc)
return None
if not running_pids:
return None
profile_processes = {}
try:
profile_processes = {
proc.pid: proc for proc in find_profile_gateway_processes()
}
except Exception as exc:
logger.debug("Could not map Windows gateway PIDs to profiles: %s", exc)
profiles: dict[str, int] = {}
mapped_pids = []
for pid in running_pids:
proc = profile_processes.get(pid)
if proc is None:
continue
profiles[str(proc.profile)] = int(pid)
mapped_pids.append(int(pid))
_write_update_planned_stop_marker(Path(proc.path), int(pid))
print("→ Stopping Windows gateway process(es) before updating Hermes...")
try:
drain_timeout = max(float(_get_restart_drain_timeout()), 1.0)
except Exception:
drain_timeout = 10.0
survivors = _wait_for_windows_update_gateway_exit(
mapped_pids,
timeout=drain_timeout,
)
unmapped_pids = [pid for pid in running_pids if pid not in profile_processes]
force_killed = []
for pid in sorted(set(survivors).union(unmapped_pids)):
try:
terminate_pid(int(pid), force=True)
force_killed.append(int(pid))
except (ProcessLookupError, PermissionError, OSError):
pass
if profiles:
print(f" ✓ Paused gateway profile(s): {', '.join(sorted(profiles))}")
if force_killed:
print(f" → Force-stopped {len(force_killed)} gateway process(es)")
if unmapped_pids:
print(
f" → Stopped {len(unmapped_pids)} gateway process(es) without profile mapping"
)
print(" Restart manually after update: hermes gateway run")
return {
"resume_needed": True,
"profiles": profiles,
"unmapped_pids": unmapped_pids,
}
def _resume_windows_gateways_after_update(token: dict | None) -> None:
"""Restart Windows profile gateways previously paused for update."""
if not token or not token.get("resume_needed"):
return
token["resume_needed"] = False
if not _is_windows():
return
profiles = token.get("profiles") or {}
if not profiles:
return
try:
from hermes_cli.gateway import launch_detached_profile_gateway_restart
except Exception as exc:
logger.debug("Could not load Windows gateway restart helper: %s", exc)
return
relaunched = []
for profile, old_pid in sorted(profiles.items()):
try:
if launch_detached_profile_gateway_restart(str(profile), int(old_pid)):
relaunched.append(str(profile))
except Exception as exc:
logger.debug(
"Could not restart Windows gateway profile %s after update: %s",
profile,
exc,
)
if relaunched:
print()
print(f" ✓ Restarting Windows gateway profile(s): {', '.join(relaunched)}")
def _discard_lockfile_churn(git_cmd, repo_root):
"""Restore tracked ``package-lock.json`` files that npm dirtied locally.
@ -8232,6 +8408,15 @@ def _cmd_update_impl(args, gateway_mode: bool):
# always roll back to the exact state they had before this update.
_run_pre_update_backup(args)
_windows_gateway_resume = _pause_windows_gateways_for_update()
if _windows_gateway_resume:
import atexit as _atexit
_atexit.register(
_resume_windows_gateways_after_update,
_windows_gateway_resume,
)
# Try git-based update first, fall back to ZIP download on Windows
# when git file I/O is broken (antivirus, NTFS filter drivers, etc.)
use_zip_update = False
@ -8294,7 +8479,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
if use_zip_update:
# ZIP-based update for Windows when git is broken
_update_via_zip(args)
try:
_update_via_zip(args)
finally:
_resume_windows_gateways_after_update(_windows_gateway_resume)
return
# Fetch and pull
@ -8431,6 +8619,7 @@ def _cmd_update_impl(args, gateway_mode: bool):
check=False,
)
print("✓ Already up to date!")
_resume_windows_gateways_after_update(_windows_gateway_resume)
return
print(f"→ Found {commit_count} new commit(s)")
@ -9627,6 +9816,8 @@ def _cmd_update_impl(args, gateway_mode: bool):
except Exception as e:
logger.debug("Gateway restart during update failed: %s", e)
_resume_windows_gateways_after_update(_windows_gateway_resume)
# Warn if legacy Hermes gateway unit files are still installed.
# When both hermes.service (from a pre-rename install) and the
# current hermes-gateway.service are enabled, they SIGTERM-fight

View file

@ -7,6 +7,7 @@ Windows-specific code paths can be exercised on any host.
from __future__ import annotations
import json
import os
import sys
import types
@ -446,6 +447,97 @@ def test_quarantine_actionable_warning_when_everything_fails(
assert "Hermes Desktop" in captured or "gateway" in captured.lower()
# ---------------------------------------------------------------------------
# Windows gateway pause/resume before update mutation
# ---------------------------------------------------------------------------
@patch.object(cli_main, "_is_windows", return_value=True)
def test_pause_windows_gateways_for_update_stops_profile_and_unmapped_pids(
_winp,
monkeypatch,
tmp_path,
capsys,
):
import gateway.status as status_mod
import hermes_cli.gateway as gateway_mod
profile_home = tmp_path / "profiles" / "work"
profile_home.mkdir(parents=True)
profile_proc = SimpleNamespace(profile="work", path=profile_home, pid=101)
monkeypatch.setattr(gateway_mod, "find_gateway_pids", lambda **_k: [101, 202])
monkeypatch.setattr(
gateway_mod,
"find_profile_gateway_processes",
lambda **_k: [profile_proc],
)
monkeypatch.setattr(gateway_mod, "_get_restart_drain_timeout", lambda: 0.1)
waited_for = []
def fake_wait(pids, *, timeout):
waited_for.extend(pids)
return set()
monkeypatch.setattr(cli_main, "_wait_for_windows_update_gateway_exit", fake_wait)
terminated = []
monkeypatch.setattr(
status_mod,
"terminate_pid",
lambda pid, force=False: terminated.append((pid, force)),
)
token = cli_main._pause_windows_gateways_for_update()
assert token == {
"resume_needed": True,
"profiles": {"work": 101},
"unmapped_pids": [202],
}
assert waited_for == [101]
assert terminated == [(202, True)]
marker = json.loads((profile_home / ".gateway-planned-stop.json").read_text())
assert marker["target_pid"] == 101
assert marker["stopper_pid"] == os.getpid()
captured = capsys.readouterr().out
assert "Paused gateway profile(s): work" in captured
assert "without profile mapping" in captured
@patch.object(cli_main, "_is_windows", return_value=True)
def test_resume_windows_gateways_after_update_relaunches_paused_profiles(
_winp,
monkeypatch,
capsys,
):
import hermes_cli.gateway as gateway_mod
relaunched = []
monkeypatch.setattr(
gateway_mod,
"launch_detached_profile_gateway_restart",
lambda profile, old_pid: relaunched.append((profile, old_pid)) or True,
)
token = {
"resume_needed": True,
"profiles": {"default": 101, "work": 202},
"unmapped_pids": [],
}
cli_main._resume_windows_gateways_after_update(token)
assert token["resume_needed"] is False
assert relaunched == [("default", 101), ("work", 202)]
assert (
"Restarting Windows gateway profile(s): default, work"
in capsys.readouterr().out
)
# ---------------------------------------------------------------------------
# cmd_update integration — concurrent-instance gate
# ---------------------------------------------------------------------------