mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): PID-based wait with force-kill for gateway restart
Add _wait_for_gateway_exit() that polls get_running_pid() to confirm the old gateway process has actually exited before starting a new one. If the process doesn't exit within 5s, sends SIGKILL to the specific PID. Uses the saved PID from gateway.pid (not launchd labels) so it works correctly with multiple gateway instances under separate HERMES_HOME directories. Applied to both launchd_restart() and the manual restart path (replaces the blind time.sleep(2)). Inspired by PR #1881 by @AzothZephyr (race condition diagnosis). Adds 4 tests.
This commit is contained in:
parent
24ac577046
commit
ace2cc6257
2 changed files with 127 additions and 4 deletions
|
|
@ -849,6 +849,46 @@ def launchd_stop():
|
|||
subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True)
|
||||
print("✓ Service stopped")
|
||||
|
||||
def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0):
|
||||
"""Wait for the gateway process (by saved PID) to exit.
|
||||
|
||||
Uses the PID from the gateway.pid file — not launchd labels — so this
|
||||
works correctly when multiple gateway instances run under separate
|
||||
HERMES_HOME directories.
|
||||
|
||||
Args:
|
||||
timeout: Total seconds to wait before giving up.
|
||||
force_after: Seconds of graceful waiting before sending SIGKILL.
|
||||
"""
|
||||
import time
|
||||
from gateway.status import get_running_pid
|
||||
|
||||
deadline = time.monotonic() + timeout
|
||||
force_deadline = time.monotonic() + force_after
|
||||
force_sent = False
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
pid = get_running_pid()
|
||||
if pid is None:
|
||||
return # Process exited cleanly.
|
||||
|
||||
if not force_sent and time.monotonic() >= force_deadline:
|
||||
# Grace period expired — force-kill the specific PID.
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
print(f"⚠ Gateway PID {pid} did not exit gracefully; sent SIGKILL")
|
||||
except (ProcessLookupError, PermissionError):
|
||||
return # Already gone or we can't touch it.
|
||||
force_sent = True
|
||||
|
||||
time.sleep(0.3)
|
||||
|
||||
# Timed out even after SIGKILL.
|
||||
remaining_pid = get_running_pid()
|
||||
if remaining_pid is not None:
|
||||
print(f"⚠ Gateway PID {remaining_pid} still running after {timeout}s — restart may fail")
|
||||
|
||||
|
||||
def launchd_restart():
|
||||
try:
|
||||
launchd_stop()
|
||||
|
|
@ -856,6 +896,7 @@ def launchd_restart():
|
|||
if e.returncode != 3:
|
||||
raise
|
||||
print("↻ launchd job was unloaded; skipping stop")
|
||||
_wait_for_gateway_exit()
|
||||
launchd_start()
|
||||
|
||||
def launchd_status(deep: bool = False):
|
||||
|
|
@ -1753,10 +1794,9 @@ def gateway_command(args):
|
|||
killed = kill_gateway_processes()
|
||||
if killed:
|
||||
print(f"✓ Stopped {killed} gateway process(es)")
|
||||
|
||||
import time
|
||||
time.sleep(2)
|
||||
|
||||
|
||||
_wait_for_gateway_exit(timeout=10.0, force_after=5.0)
|
||||
|
||||
# Start fresh
|
||||
print("Starting gateway...")
|
||||
run_gateway(verbose=False)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue