fix: scope gateway stop/restart to current profile, --all for global kill

gateway stop and restart previously called kill_gateway_processes() which
scans ps aux and kills ALL gateway processes across all profiles. Starting
a profile gateway would nuke the main one (and vice versa).

Now:
- hermes gateway stop → only kills the current profile's gateway (PID file)
- hermes -p work gateway stop → only kills the 'work' profile's gateway
- hermes gateway stop --all → kills every gateway process (old behavior)
- hermes gateway restart → profile-scoped for manual fallback path
- hermes update → discovers and restarts ALL profile gateways (systemctl
  list-units hermes-gateway*) since the code update is shared

Added stop_profile_gateway() which uses the HERMES_HOME-scoped PID file
instead of global process scanning.
This commit is contained in:
Teknium 2026-04-03 14:21:25 -07:00
parent 52ddd6bc64
commit 84a875ca02
No known key found for this signature in database
4 changed files with 229 additions and 172 deletions

View file

@ -89,7 +89,7 @@ def find_gateway_pids() -> list:
def kill_gateway_processes(force: bool = False) -> int:
"""Kill any running gateway processes. Returns count killed."""
"""Kill ALL running gateway processes (across all profiles). Returns count killed."""
pids = find_gateway_pids()
killed = 0
@ -109,6 +109,43 @@ def kill_gateway_processes(force: bool = False) -> int:
return killed
def stop_profile_gateway() -> bool:
"""Stop only the gateway for the current profile (HERMES_HOME-scoped).
Uses the PID file written by start_gateway(), so it only kills the
gateway belonging to this profile not gateways from other profiles.
Returns True if a process was stopped, False if none was found.
"""
try:
from gateway.status import get_running_pid, remove_pid_file
except ImportError:
return False
pid = get_running_pid()
if pid is None:
return False
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
pass # Already gone
except PermissionError:
print(f"⚠ Permission denied to kill PID {pid}")
return False
# Wait briefly for it to exit
import time as _time
for _ in range(20):
try:
os.kill(pid, 0)
_time.sleep(0.5)
except (ProcessLookupError, PermissionError):
break
remove_pid_file()
return True
def is_linux() -> bool:
return sys.platform.startswith('linux')
@ -1831,7 +1868,7 @@ def gateway_setup():
elif is_macos():
launchd_restart()
else:
kill_gateway_processes()
stop_profile_gateway()
print_info("Start manually: hermes gateway")
except subprocess.CalledProcessError as e:
print_error(f" Restart failed: {e}")
@ -1945,31 +1982,54 @@ def gateway_command(args):
sys.exit(1)
elif subcmd == "stop":
# Try service first, then sweep any stray/manual gateway processes.
service_available = False
stop_all = getattr(args, 'all', False)
system = getattr(args, 'system', False)
if is_linux() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
try:
systemd_stop(system=system)
service_available = True
except subprocess.CalledProcessError:
pass # Fall through to process kill
elif is_macos() and get_launchd_plist_path().exists():
try:
launchd_stop()
service_available = True
except subprocess.CalledProcessError:
pass
killed = kill_gateway_processes()
if not service_available:
if killed:
print(f"✓ Stopped {killed} gateway process(es)")
if stop_all:
# --all: kill every gateway process on the machine
service_available = False
if is_linux() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
try:
systemd_stop(system=system)
service_available = True
except subprocess.CalledProcessError:
pass
elif is_macos() and get_launchd_plist_path().exists():
try:
launchd_stop()
service_available = True
except subprocess.CalledProcessError:
pass
killed = kill_gateway_processes()
total = killed + (1 if service_available else 0)
if total:
print(f"✓ Stopped {total} gateway process(es) across all profiles")
else:
print("✗ No gateway processes found")
elif killed:
print(f"✓ Stopped {killed} additional manual gateway process(es)")
else:
# Default: stop only the current profile's gateway
service_available = False
if is_linux() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
try:
systemd_stop(system=system)
service_available = True
except subprocess.CalledProcessError:
pass
elif is_macos() and get_launchd_plist_path().exists():
try:
launchd_stop()
service_available = True
except subprocess.CalledProcessError:
pass
if not service_available:
# No systemd/launchd — use profile-scoped PID file
if stop_profile_gateway():
print("✓ Stopped gateway for this profile")
else:
print("✗ No gateway running for this profile")
else:
print(f"✓ Stopped {get_service_name()} service")
elif subcmd == "restart":
# Try service first, fall back to killing and restarting
@ -2016,10 +2076,9 @@ def gateway_command(args):
print(" Fix the service, then retry: hermes gateway start")
sys.exit(1)
# Manual restart: kill existing processes
killed = kill_gateway_processes()
if killed:
print(f"✓ Stopped {killed} gateway process(es)")
# Manual restart: stop only this profile's gateway
if stop_profile_gateway():
print("✓ Stopped gateway for this profile")
_wait_for_gateway_exit(timeout=10.0, force_after=5.0)

View file

@ -3516,139 +3516,103 @@ def cmd_update(args):
print()
print("✓ Update complete!")
# Auto-restart gateway if it's running.
# Uses the PID file (scoped to HERMES_HOME) to find this
# installation's gateway — safe with multiple installations.
# Auto-restart ALL gateways after update.
# The code update (git pull) is shared across all profiles, so every
# running gateway needs restarting to pick up the new code.
try:
from gateway.status import get_running_pid, remove_pid_file
from hermes_cli.gateway import (
get_service_name, get_launchd_plist_path, is_macos, is_linux,
launchd_restart, _ensure_user_systemd_env,
get_systemd_linger_status,
is_macos, is_linux, _ensure_user_systemd_env,
get_systemd_linger_status, find_gateway_pids,
)
import signal as _signal
_gw_service_name = get_service_name()
existing_pid = get_running_pid()
has_systemd_service = False
has_system_service = False
has_launchd_service = False
restarted_services = []
killed_pids = set()
try:
_ensure_user_systemd_env()
check = subprocess.run(
["systemctl", "--user", "is-active", _gw_service_name],
capture_output=True, text=True, timeout=5,
)
has_systemd_service = check.stdout.strip() == "active"
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
# Also check for a system-level service (hermes gateway install --system).
# This covers gateways running under system systemd where --user
# fails due to missing D-Bus session.
if not has_systemd_service and is_linux():
# --- Systemd services (Linux) ---
# Discover all hermes-gateway* units (default + profiles)
if is_linux():
try:
check = subprocess.run(
["systemctl", "is-active", _gw_service_name],
capture_output=True, text=True, timeout=5,
)
has_system_service = check.stdout.strip() == "active"
except (FileNotFoundError, subprocess.TimeoutExpired):
_ensure_user_systemd_env()
except Exception:
pass
# Check for macOS launchd service
for scope, scope_cmd in [("user", ["systemctl", "--user"]), ("system", ["systemctl"])]:
try:
result = subprocess.run(
scope_cmd + ["list-units", "hermes-gateway*", "--plain", "--no-legend", "--no-pager"],
capture_output=True, text=True, timeout=10,
)
for line in result.stdout.strip().splitlines():
parts = line.split()
if not parts:
continue
unit = parts[0] # e.g. hermes-gateway.service or hermes-gateway-coder.service
if not unit.endswith(".service"):
continue
svc_name = unit.removesuffix(".service")
# Check if active
check = subprocess.run(
scope_cmd + ["is-active", svc_name],
capture_output=True, text=True, timeout=5,
)
if check.stdout.strip() == "active":
restart = subprocess.run(
scope_cmd + ["restart", svc_name],
capture_output=True, text=True, timeout=15,
)
if restart.returncode == 0:
restarted_services.append(svc_name)
else:
print(f" ⚠ Failed to restart {svc_name}: {restart.stderr.strip()}")
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
# --- Launchd services (macOS) ---
if is_macos():
try:
from hermes_cli.gateway import get_launchd_label
from hermes_cli.gateway import launchd_restart, get_launchd_label, get_launchd_plist_path
plist_path = get_launchd_plist_path()
if plist_path.exists():
check = subprocess.run(
["launchctl", "list", get_launchd_label()],
capture_output=True, text=True, timeout=5,
)
has_launchd_service = check.returncode == 0
except (FileNotFoundError, subprocess.TimeoutExpired):
if check.returncode == 0:
try:
launchd_restart()
restarted_services.append(get_launchd_label())
except subprocess.CalledProcessError as e:
stderr = (getattr(e, "stderr", "") or "").strip()
print(f" ⚠ Gateway restart failed: {stderr}")
except (FileNotFoundError, subprocess.TimeoutExpired, ImportError):
pass
if existing_pid or has_systemd_service or has_system_service or has_launchd_service:
print()
# --- Manual (non-service) gateways ---
# Kill any remaining gateway processes not managed by a service
manual_pids = find_gateway_pids()
for pid in manual_pids:
try:
os.kill(pid, _signal.SIGTERM)
killed_pids.add(pid)
except (ProcessLookupError, PermissionError):
pass
if restarted_services or killed_pids:
print()
for svc in restarted_services:
print(f" ✓ Restarted {svc}")
if killed_pids:
print(f" → Stopped {len(killed_pids)} manual gateway process(es)")
print(" Restart manually: hermes gateway run")
# Also restart for each profile if needed
if len(killed_pids) > 1:
print(" (or: hermes -p <profile> gateway run for each profile)")
if not restarted_services and not killed_pids:
# No gateways were running — nothing to do
pass
# When a service manager is handling the gateway, let it
# manage the lifecycle — don't manually SIGTERM the PID
# (launchd KeepAlive would respawn immediately, causing races).
if has_systemd_service:
import time as _time
if existing_pid:
try:
os.kill(existing_pid, _signal.SIGTERM)
print(f"→ Stopped gateway process (PID {existing_pid})")
except ProcessLookupError:
pass
except PermissionError:
print(f"⚠ Permission denied killing gateway PID {existing_pid}")
remove_pid_file()
_time.sleep(1) # Brief pause for port/socket release
print("→ Restarting gateway service...")
restart = subprocess.run(
["systemctl", "--user", "restart", _gw_service_name],
capture_output=True, text=True, timeout=15,
)
if restart.returncode == 0:
print("✓ Gateway restarted.")
else:
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
# Check if linger is the issue
if is_linux():
linger_ok, _detail = get_systemd_linger_status()
if linger_ok is not True:
import getpass
_username = getpass.getuser()
print()
print(" Linger must be enabled for the gateway user service to function.")
print(f" Run: sudo loginctl enable-linger {_username}")
print()
print(" Then restart the gateway:")
print(" hermes gateway restart")
else:
print(" Try manually: hermes gateway restart")
elif has_system_service:
# System-level service (hermes gateway install --system).
# No D-Bus session needed — systemctl without --user talks
# directly to the system manager over /run/systemd/private.
print("→ Restarting system gateway service...")
restart = subprocess.run(
["systemctl", "restart", _gw_service_name],
capture_output=True, text=True, timeout=15,
)
if restart.returncode == 0:
print("✓ Gateway restarted (system service).")
else:
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
print(" System services may require root. Try:")
print(f" sudo systemctl restart {_gw_service_name}")
elif has_launchd_service:
# Use the shared launchd restart helper so we wait for the
# old gateway process to fully exit before starting the new
# one. This avoids stop/start races during self-update.
print("→ Restarting gateway service...")
try:
launchd_restart()
except subprocess.CalledProcessError as e:
stderr = (getattr(e, "stderr", "") or "").strip()
print(f"⚠ Gateway restart failed: {stderr}")
print(" Try manually: hermes gateway restart")
elif existing_pid:
try:
os.kill(existing_pid, _signal.SIGTERM)
print(f"→ Stopped gateway process (PID {existing_pid})")
except ProcessLookupError:
pass # Already gone
except PermissionError:
print(f"⚠ Permission denied killing gateway PID {existing_pid}")
remove_pid_file()
print(" Gateway was running manually (not as a service).")
print(" Restart it with: hermes gateway run")
except Exception as e:
logger.debug("Gateway restart during update failed: %s", e)
@ -4214,6 +4178,7 @@ For more help on a command:
# gateway stop
gateway_stop = gateway_subparsers.add_parser("stop", help="Stop gateway service")
gateway_stop.add_argument("--system", action="store_true", help="Target the Linux system-level gateway service")
gateway_stop.add_argument("--all", action="store_true", help="Stop ALL gateway processes across all profiles")
# gateway restart
gateway_restart = gateway_subparsers.add_parser("restart", help="Restart gateway service")

View file

@ -103,7 +103,9 @@ class TestGeneratedSystemdUnits:
class TestGatewayStopCleanup:
def test_stop_sweeps_manual_gateway_processes_after_service_stop(self, tmp_path, monkeypatch):
def test_stop_only_kills_current_profile_by_default(self, tmp_path, monkeypatch):
"""Without --all, stop uses systemd (if available) and does NOT call
the global kill_gateway_processes()."""
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("unit\n", encoding="utf-8")
@ -123,6 +125,31 @@ class TestGatewayStopCleanup:
gateway_cli.gateway_command(SimpleNamespace(gateway_command="stop"))
assert service_calls == ["stop"]
# Global kill should NOT be called without --all
assert kill_calls == []
def test_stop_all_sweeps_all_gateway_processes(self, tmp_path, monkeypatch):
"""With --all, stop uses systemd AND calls the global kill_gateway_processes()."""
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("unit\n", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path)
service_calls = []
kill_calls = []
monkeypatch.setattr(gateway_cli, "systemd_stop", lambda system=False: service_calls.append("stop"))
monkeypatch.setattr(
gateway_cli,
"kill_gateway_processes",
lambda force=False: kill_calls.append(force) or 2,
)
gateway_cli.gateway_command(SimpleNamespace(gateway_command="stop", **{"all": True}))
assert service_calls == ["stop"]
assert kill_calls == [False]

View file

@ -47,6 +47,22 @@ def _make_run_side_effect(
if "rev-list" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout=f"{commit_count}\n", stderr="")
# systemctl list-units hermes-gateway* — discover all gateway services
if "systemctl" in joined and "list-units" in joined:
if "--user" in joined and systemd_active:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded active running Hermes Gateway\n",
stderr="",
)
elif "--user" not in joined and system_service_active:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded active running Hermes Gateway\n",
stderr="",
)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
# systemctl is-active — distinguish --user from system scope
if "systemctl" in joined and "is-active" in joined:
if "--user" in joined:
@ -305,15 +321,14 @@ class TestCmdUpdateLaunchdRestart:
launchctl_loaded=True,
)
# Mock get_running_pid to return a PID
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"), \
patch.object(gateway_cli, "launchd_restart") as mock_launchd_restart:
# Mock launchd_restart + find_gateway_pids (new code discovers all gateways)
with patch.object(gateway_cli, "launchd_restart") as mock_launchd_restart, \
patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "Restarting gateway service" in captured
assert "Restart it with: hermes gateway run" not in captured
assert "Restarted" in captured
assert "Restart manually: hermes gateway run" not in captured
mock_launchd_restart.assert_called_once_with()
@patch("shutil.which", return_value=None)
@ -321,7 +336,7 @@ class TestCmdUpdateLaunchdRestart:
def test_update_without_launchd_shows_manual_restart(
self, mock_run, _mock_which, mock_args, capsys, tmp_path, monkeypatch,
):
"""When no service manager is running, update should show the manual restart hint."""
"""When no service manager is running but manual gateway is found, show manual restart hint."""
monkeypatch.setattr(
gateway_cli, "is_macos", lambda: True,
)
@ -336,14 +351,13 @@ class TestCmdUpdateLaunchdRestart:
launchctl_loaded=False,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"), \
# Simulate a manual gateway process found by find_gateway_pids
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
patch("os.kill"):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "Restart it with: hermes gateway run" in captured
assert "Gateway restarted via launchd" not in captured
assert "Restart manually: hermes gateway run" in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
@ -360,13 +374,11 @@ class TestCmdUpdateLaunchdRestart:
systemd_active=True,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"), \
patch("os.kill"):
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "Gateway restarted" in captured
assert "Restarted hermes-gateway" in captured
# Verify systemctl restart was called
restart_calls = [
c for c in mock_run.call_args_list
@ -422,13 +434,11 @@ class TestCmdUpdateSystemService:
system_service_active=True,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"):
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "system gateway service" in captured.lower()
assert "Gateway restarted (system service)" in captured
assert "Restarted hermes-gateway" in captured
# Verify systemctl restart (no --user) was called
restart_calls = [
c for c in mock_run.call_args_list
@ -440,10 +450,10 @@ class TestCmdUpdateSystemService:
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_system_service_restart_failure_shows_sudo_hint(
def test_update_system_service_restart_failure_shows_error(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""When system service restart fails (e.g. no root), show sudo hint."""
"""When system service restart fails, show the failure message."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
@ -454,19 +464,18 @@ class TestCmdUpdateSystemService:
system_restart_rc=1,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"):
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "sudo systemctl restart" in captured
assert "Failed to restart" in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_user_service_takes_priority_over_system(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""When both user and system services are active, user wins."""
"""When both user and system services are active, both are restarted."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
@ -476,12 +485,9 @@ class TestCmdUpdateSystemService:
system_service_active=True,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"), \
patch("os.kill"):
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
captured = capsys.readouterr().out
# Should restart via user service, not system
assert "Gateway restarted." in captured
assert "(system service)" not in captured
# Both scopes are discovered and restarted
assert "Restarted hermes-gateway" in captured