mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): close --replace race completely by claiming PID before adapter startup
Follow-up on top of opriz's atomic PID file fix. The prior change caught the race AFTER runner.start(), so the loser still opened Telegram polling and Discord gateway sockets before detecting the conflict and exiting. Hoist the PID-claim block to BEFORE runner.start(). Now the loser of the O_CREAT|O_EXCL race returns from start_gateway() without ever bringing up any platform adapter — no Telegram conflict, no Discord duplicate session. Also add regression tests: - test_write_pid_file_is_atomic_against_concurrent_writers: second write_pid_file() raises FileExistsError rather than clobbering. - Two existing replace-path tests updated to stateful mocks since the real post-kill state (get_running_pid None after remove_pid_file) is now exercised by the hoisted re-check.
This commit is contained in:
parent
56b99e8239
commit
ce9c91c8f7
3 changed files with 64 additions and 27 deletions
|
|
@ -10951,6 +10951,30 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
else:
|
||||
logger.info("Skipping signal handlers (not running in main thread).")
|
||||
|
||||
# Claim the PID file BEFORE bringing up any platform adapters.
|
||||
# This closes the --replace race window: two concurrent `gateway run
|
||||
# --replace` invocations both pass the termination-wait above, but
|
||||
# only the winner of the O_CREAT|O_EXCL race below will ever open
|
||||
# Telegram polling, Discord gateway sockets, etc. The loser exits
|
||||
# cleanly before touching any external service.
|
||||
import atexit
|
||||
from gateway.status import write_pid_file, remove_pid_file, get_running_pid
|
||||
_current_pid = get_running_pid()
|
||||
if _current_pid is not None and _current_pid != os.getpid():
|
||||
logger.error(
|
||||
"Another gateway instance (PID %d) started during our startup. "
|
||||
"Exiting to avoid double-running.", _current_pid
|
||||
)
|
||||
return False
|
||||
try:
|
||||
write_pid_file()
|
||||
except FileExistsError:
|
||||
logger.error(
|
||||
"PID file race lost to another gateway instance. Exiting."
|
||||
)
|
||||
return False
|
||||
atexit.register(remove_pid_file)
|
||||
|
||||
# Start the gateway
|
||||
success = await runner.start()
|
||||
if not success:
|
||||
|
|
@ -10960,29 +10984,6 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
logger.error("Gateway exiting cleanly: %s", runner.exit_reason)
|
||||
return True
|
||||
|
||||
# Write PID file so CLI can detect gateway is running
|
||||
import atexit
|
||||
from gateway.status import write_pid_file, remove_pid_file, get_running_pid
|
||||
# Defensive re-check: another --replace racer may have started
|
||||
# while we were initializing. If so, yield and exit.
|
||||
_current_pid = get_running_pid()
|
||||
if _current_pid is not None and _current_pid != os.getpid():
|
||||
logger.error(
|
||||
"Another gateway instance (PID %d) started during our startup. "
|
||||
"Exiting to avoid double-running.", _current_pid
|
||||
)
|
||||
await runner.stop()
|
||||
return False
|
||||
try:
|
||||
write_pid_file()
|
||||
except FileExistsError:
|
||||
logger.error(
|
||||
"PID file race lost to another gateway instance. Exiting."
|
||||
)
|
||||
await runner.stop()
|
||||
return False
|
||||
atexit.register(remove_pid_file)
|
||||
|
||||
# Start background cron ticker so scheduled jobs fire automatically.
|
||||
# Pass the event loop so cron delivery can use live adapters (E2EE support).
|
||||
cron_stop = threading.Event()
|
||||
|
|
|
|||
|
|
@ -184,8 +184,15 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
|
|||
async def stop(self):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", lambda: None)
|
||||
# get_running_pid returns 42 before we kill the old gateway, then None
|
||||
# after remove_pid_file() clears the record (reflects real behavior).
|
||||
_pid_state = {"alive": True}
|
||||
def _mock_get_running_pid():
|
||||
return 42 if _pid_state["alive"] else None
|
||||
def _mock_remove_pid_file():
|
||||
_pid_state["alive"] = False
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
||||
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
|
||||
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
|
||||
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
||||
|
|
@ -253,8 +260,13 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
|
|||
async def stop(self):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", lambda: None)
|
||||
_pid_state = {"alive": True}
|
||||
def _mock_get_running_pid():
|
||||
return 42 if _pid_state["alive"] else None
|
||||
def _mock_remove_pid_file():
|
||||
_pid_state["alive"] = False
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
||||
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
|
||||
monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker)
|
||||
monkeypatch.setattr("gateway.status.terminate_pid", record_terminate)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,30 @@ class TestGatewayPidState:
|
|||
assert isinstance(payload["argv"], list)
|
||||
assert payload["argv"]
|
||||
|
||||
def test_write_pid_file_is_atomic_against_concurrent_writers(self, tmp_path, monkeypatch):
|
||||
"""Regression: two concurrent --replace invocations must not both win.
|
||||
|
||||
Without O_CREAT|O_EXCL, two processes racing through start_gateway()'s
|
||||
termination-wait would both write to gateway.pid, silently overwriting
|
||||
each other and leaving multiple gateway instances alive (#11718).
|
||||
"""
|
||||
import pytest
|
||||
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
# First write wins.
|
||||
status.write_pid_file()
|
||||
assert (tmp_path / "gateway.pid").exists()
|
||||
|
||||
# Second write (simulating a racing --replace that missed the earlier
|
||||
# guards) must raise FileExistsError rather than clobber the record.
|
||||
with pytest.raises(FileExistsError):
|
||||
status.write_pid_file()
|
||||
|
||||
# Original record is preserved.
|
||||
payload = json.loads((tmp_path / "gateway.pid").read_text())
|
||||
assert payload["pid"] == os.getpid()
|
||||
|
||||
def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
pid_path = tmp_path / "gateway.pid"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue