feat(gateway): restart manual profile gateways after update

This commit is contained in:
Michael Nguyen 2026-04-30 21:14:53 +07:00 committed by Teknium
parent 84324d06b8
commit 77fe7ab6b2
4 changed files with 150 additions and 8 deletions

View file

@ -10,6 +10,7 @@ import shutil
import signal
import subprocess
import sys
import textwrap
from dataclasses import dataclass
from pathlib import Path
@ -59,6 +60,13 @@ class GatewayRuntimeSnapshot:
def has_process_service_mismatch(self) -> bool:
return self.service_installed and self.running and not self.service_running
@dataclass(frozen=True)
class ProfileGatewayProcess:
profile: str
path: Path
pid: int
def _get_service_pids() -> set:
"""Return PIDs currently managed by systemd or launchd gateway services.
@ -371,6 +379,83 @@ def find_gateway_pids(exclude_pids: set | None = None, all_profiles: bool = Fals
return pids
def find_profile_gateway_processes(
exclude_pids: set | None = None,
) -> list[ProfileGatewayProcess]:
"""Return running gateway PIDs mapped to Hermes profiles via PID files."""
_exclude = set(exclude_pids or set())
processes: list[ProfileGatewayProcess] = []
try:
from gateway.status import get_running_pid
from hermes_cli.profiles import list_profiles
except Exception:
return processes
seen: set[int] = set()
for profile in list_profiles():
try:
pid = get_running_pid(profile.path / "gateway.pid", cleanup_stale=False)
except Exception:
continue
if pid is None or pid <= 0 or pid in _exclude or pid in seen:
continue
seen.add(pid)
processes.append(ProfileGatewayProcess(profile=profile.name, path=profile.path, pid=pid))
return processes
def _gateway_run_args_for_profile(profile: str) -> list[str]:
args = [get_python_path(), "-m", "hermes_cli.main"]
if profile != "default":
args.extend(["--profile", profile])
args.extend(["gateway", "run", "--replace"])
return args
def launch_detached_profile_gateway_restart(profile: str, old_pid: int) -> bool:
"""Relaunch a manually-run profile gateway after its current PID exits."""
if old_pid <= 0:
return False
watcher = textwrap.dedent(
"""
import os
import subprocess
import sys
import time
pid = int(sys.argv[1])
cmd = sys.argv[2:]
deadline = time.monotonic() + 120
while time.monotonic() < deadline:
try:
os.kill(pid, 0)
except ProcessLookupError:
break
except PermissionError:
pass
time.sleep(0.2)
subprocess.Popen(
cmd,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
"""
).strip()
try:
subprocess.Popen(
[sys.executable, "-c", watcher, str(old_pid), *_gateway_run_args_for_profile(profile)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
except OSError:
return False
return True
def _probe_systemd_service_running(system: bool = False) -> tuple[bool, bool]:
selected_system = _select_systemd_scope(system)
unit_exists = get_systemd_unit_path(system=selected_system).exists()
@ -4377,4 +4462,4 @@ def _gateway_command_inner(args):
if not supports_systemd_services() and not is_macos():
print("Legacy unit migration only applies to systemd-based Linux hosts.")
return
remove_legacy_hermes_units(interactive=not yes, dry_run=dry_run)
remove_legacy_hermes_units(interactive=not yes, dry_run=dry_run)

View file

@ -7137,6 +7137,8 @@ def _cmd_update_impl(args, gateway_mode: bool):
supports_systemd_services,
_ensure_user_systemd_env,
find_gateway_pids,
find_profile_gateway_processes,
launch_detached_profile_gateway_restart,
_get_service_pids,
_graceful_restart_via_sigusr1,
)
@ -7240,6 +7242,7 @@ def _cmd_update_impl(args, gateway_mode: bool):
restarted_services = []
killed_pids = set()
relaunched_profiles = []
# --- Systemd services (Linux) ---
# Discover all hermes-gateway* units (default + profiles)
@ -7429,7 +7432,23 @@ def _cmd_update_impl(args, gateway_mode: bool):
manual_pids = find_gateway_pids(
exclude_pids=service_pids, all_profiles=True
)
profile_processes = {
proc.pid: proc
for proc in find_profile_gateway_processes(exclude_pids=service_pids)
if proc.pid in manual_pids
}
for pid, proc in profile_processes.items():
if launch_detached_profile_gateway_restart(proc.profile, pid):
try:
os.kill(pid, _signal.SIGTERM)
killed_pids.add(pid)
relaunched_profiles.append(proc.profile)
except (ProcessLookupError, PermissionError):
pass
for pid in manual_pids:
if pid in profile_processes:
continue
try:
os.kill(pid, _signal.SIGTERM)
killed_pids.add(pid)
@ -7440,11 +7459,14 @@ def _cmd_update_impl(args, gateway_mode: bool):
print()
for svc in restarted_services:
print(f" ✓ Restarted {svc}")
if killed_pids:
print(f" → Stopped {len(killed_pids)} manual gateway process(es)")
if relaunched_profiles:
names = ", ".join(relaunched_profiles)
print(f" ✓ Restarting manual gateway profile(s): {names}")
unmapped_count = len(killed_pids) - len(relaunched_profiles)
if unmapped_count:
print(f" → Stopped {unmapped_count} manual gateway process(es)")
print(" Restart manually: hermes gateway run")
# Also restart for each profile if needed
if len(killed_pids) > 1:
if unmapped_count > 1:
print(
" (or: hermes -p <profile> gateway run for each profile)"
)

View file

@ -392,6 +392,41 @@ class TestCmdUpdateLaunchdRestart:
captured = capsys.readouterr().out
assert "Restart manually: hermes gateway run" in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_restarts_profile_manual_gateways(
self, mock_run, _mock_which, mock_args, capsys, tmp_path, monkeypatch,
):
"""Profile-mapped manual gateways are relaunched automatically after update."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
monkeypatch.setattr(
gateway_cli,
"get_launchd_plist_path",
lambda: tmp_path / "ai.hermes.gateway.plist",
)
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
launchctl_loaded=False,
)
process = gateway_cli.ProfileGatewayProcess(
profile="coder",
path=tmp_path / ".hermes" / "profiles" / "coder",
pid=12345,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
patch("os.kill") as kill:
cmd_update(mock_args)
captured = capsys.readouterr().out
restart.assert_called_once_with("coder", 12345)
kill.assert_called_once()
assert "Restarting manual gateway profile(s): coder" in captured
assert "Restart manually: hermes gateway run" not in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_with_systemd_still_restarts_via_systemd(

View file

@ -28,7 +28,7 @@ When you run `hermes update`, the following steps occur:
2. **Git pull** — pulls the latest code from the `main` branch and updates submodules
3. **Dependency install** — runs `uv pip install -e ".[all]"` to pick up new or changed dependencies
4. **Config migration** — detects new config options added since your version and prompts you to set them
5. **Gateway auto-restart**if the gateway service is running (systemd on Linux, launchd on macOS), it is **automatically restarted** after the update completes so the new code takes effect immediately
5. **Gateway auto-restart**running gateways are refreshed after the update completes so the new code takes effect immediately. Service-managed gateways (systemd on Linux, launchd on macOS) are restarted through the service manager. Manual gateways are relaunched automatically when Hermes can map the running PID back to a profile.
### Preview-only: `hermes update --check`
@ -63,7 +63,7 @@ Already up to date. (or: Updating abc1234..def5678)
✅ Dependencies updated
🔍 Checking for new config options...
✅ Config is up to date (or: Found 2 new options — running migration...)
🔄 Restarting gateway service...
🔄 Restarting gateways...
✅ Gateway restarted
✅ Hermes Agent updated successfully!
```
@ -113,7 +113,7 @@ You can also update directly from Telegram, Discord, Slack, or WhatsApp by sendi
/update
```
This pulls the latest code, updates dependencies, and restarts the gateway. The bot will briefly go offline during the restart (typically 515 seconds) and then resume.
This pulls the latest code, updates dependencies, and restarts running gateways. The bot will briefly go offline during the restart (typically 515 seconds) and then resume.
### Manual Update