mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): only clear locks belonging to the replaced process
This commit is contained in:
parent
a0d8dd7ba3
commit
d0821b0573
4 changed files with 95 additions and 6 deletions
|
|
@ -10954,6 +10954,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
from gateway.status import (
|
||||
acquire_gateway_runtime_lock,
|
||||
get_running_pid,
|
||||
get_process_start_time,
|
||||
release_gateway_runtime_lock,
|
||||
remove_pid_file,
|
||||
terminate_pid,
|
||||
|
|
@ -10961,6 +10962,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
existing_pid = get_running_pid()
|
||||
if existing_pid is not None and existing_pid != os.getpid():
|
||||
if replace:
|
||||
existing_start_time = get_process_start_time(existing_pid)
|
||||
logger.info(
|
||||
"Replacing existing gateway instance (PID %d) with --replace.",
|
||||
existing_pid,
|
||||
|
|
@ -11029,7 +11031,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
# leaving stale lock files that block the new gateway from starting.
|
||||
try:
|
||||
from gateway.status import release_all_scoped_locks
|
||||
_released = release_all_scoped_locks()
|
||||
_released = release_all_scoped_locks(
|
||||
owner_pid=existing_pid,
|
||||
owner_start_time=existing_start_time,
|
||||
)
|
||||
if _released:
|
||||
logger.info("Released %d stale scoped lock(s) from old gateway.", _released)
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -113,6 +113,11 @@ def _get_process_start_time(pid: int) -> Optional[int]:
|
|||
return None
|
||||
|
||||
|
||||
def get_process_start_time(pid: int) -> Optional[int]:
|
||||
"""Public wrapper for retrieving a process start time when available."""
|
||||
return _get_process_start_time(pid)
|
||||
|
||||
|
||||
def _read_process_cmdline(pid: int) -> Optional[str]:
|
||||
"""Return the process command line as a space-separated string."""
|
||||
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
||||
|
|
@ -562,17 +567,43 @@ def release_scoped_lock(scope: str, identity: str) -> None:
|
|||
pass
|
||||
|
||||
|
||||
def release_all_scoped_locks() -> int:
|
||||
"""Remove all scoped lock files in the lock directory.
|
||||
def release_all_scoped_locks(
|
||||
*,
|
||||
owner_pid: Optional[int] = None,
|
||||
owner_start_time: Optional[int] = None,
|
||||
) -> int:
|
||||
"""Remove scoped lock files in the lock directory.
|
||||
|
||||
Called during --replace to clean up stale locks left by stopped/killed
|
||||
gateway processes that did not release their locks gracefully.
|
||||
gateway processes that did not release their locks gracefully. When an
|
||||
``owner_pid`` is provided, only lock records belonging to that gateway
|
||||
process are removed. ``owner_start_time`` further narrows the match to
|
||||
protect against PID reuse.
|
||||
|
||||
When no owner is provided, preserves the legacy behavior and removes every
|
||||
scoped lock file in the directory.
|
||||
|
||||
Returns the number of lock files removed.
|
||||
"""
|
||||
lock_dir = _get_lock_dir()
|
||||
removed = 0
|
||||
if lock_dir.exists():
|
||||
for lock_file in lock_dir.glob("*.lock"):
|
||||
if owner_pid is not None:
|
||||
record = _read_json_file(lock_file)
|
||||
if not isinstance(record, dict):
|
||||
continue
|
||||
try:
|
||||
record_pid = int(record.get("pid"))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if record_pid != owner_pid:
|
||||
continue
|
||||
if (
|
||||
owner_start_time is not None
|
||||
and record.get("start_time") != owner_start_time
|
||||
):
|
||||
continue
|
||||
try:
|
||||
lock_file.unlink(missing_ok=True)
|
||||
removed += 1
|
||||
|
|
|
|||
|
|
@ -193,7 +193,10 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
|
|||
_pid_state["alive"] = False
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
||||
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.release_all_scoped_locks",
|
||||
lambda **kwargs: 0,
|
||||
)
|
||||
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
|
||||
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
||||
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
|
||||
|
|
@ -267,7 +270,10 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
|
|||
_pid_state["alive"] = False
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
||||
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.release_all_scoped_locks",
|
||||
lambda **kwargs: 0,
|
||||
)
|
||||
monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker)
|
||||
monkeypatch.setattr("gateway.status.terminate_pid", record_terminate)
|
||||
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
||||
|
|
|
|||
|
|
@ -404,6 +404,53 @@ class TestScopedLocks:
|
|||
status.release_scoped_lock("telegram-bot-token", "secret")
|
||||
assert not lock_path.exists()
|
||||
|
||||
def test_release_all_scoped_locks_can_target_single_owner(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||
lock_dir = tmp_path / "locks"
|
||||
lock_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
target_lock = lock_dir / "telegram-bot-token-target.lock"
|
||||
other_lock = lock_dir / "slack-app-token-other.lock"
|
||||
target_lock.write_text(json.dumps({
|
||||
"pid": 111,
|
||||
"start_time": 222,
|
||||
"kind": "hermes-gateway",
|
||||
}))
|
||||
other_lock.write_text(json.dumps({
|
||||
"pid": 999,
|
||||
"start_time": 333,
|
||||
"kind": "hermes-gateway",
|
||||
}))
|
||||
|
||||
removed = status.release_all_scoped_locks(
|
||||
owner_pid=111,
|
||||
owner_start_time=222,
|
||||
)
|
||||
|
||||
assert removed == 1
|
||||
assert not target_lock.exists()
|
||||
assert other_lock.exists()
|
||||
|
||||
def test_release_all_scoped_locks_skips_pid_reuse_mismatch(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||
lock_dir = tmp_path / "locks"
|
||||
lock_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
reused_pid_lock = lock_dir / "telegram-bot-token-reused.lock"
|
||||
reused_pid_lock.write_text(json.dumps({
|
||||
"pid": 111,
|
||||
"start_time": 999,
|
||||
"kind": "hermes-gateway",
|
||||
}))
|
||||
|
||||
removed = status.release_all_scoped_locks(
|
||||
owner_pid=111,
|
||||
owner_start_time=222,
|
||||
)
|
||||
|
||||
assert removed == 0
|
||||
assert reused_pid_lock.exists()
|
||||
|
||||
|
||||
class TestTakeoverMarker:
|
||||
"""Tests for the --replace takeover marker.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue