diff --git a/gateway/run.py b/gateway/run.py index 3eb932cc2..881f77cb7 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -10954,6 +10954,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = from gateway.status import ( acquire_gateway_runtime_lock, get_running_pid, + get_process_start_time, release_gateway_runtime_lock, remove_pid_file, terminate_pid, @@ -10961,6 +10962,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = existing_pid = get_running_pid() if existing_pid is not None and existing_pid != os.getpid(): if replace: + existing_start_time = get_process_start_time(existing_pid) logger.info( "Replacing existing gateway instance (PID %d) with --replace.", existing_pid, @@ -11029,7 +11031,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = # leaving stale lock files that block the new gateway from starting. try: from gateway.status import release_all_scoped_locks - _released = release_all_scoped_locks() + _released = release_all_scoped_locks( + owner_pid=existing_pid, + owner_start_time=existing_start_time, + ) if _released: logger.info("Released %d stale scoped lock(s) from old gateway.", _released) except Exception: diff --git a/gateway/status.py b/gateway/status.py index 9e373564d..7f7df182f 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -113,6 +113,11 @@ def _get_process_start_time(pid: int) -> Optional[int]: return None +def get_process_start_time(pid: int) -> Optional[int]: + """Public wrapper for retrieving a process start time when available.""" + return _get_process_start_time(pid) + + def _read_process_cmdline(pid: int) -> Optional[str]: """Return the process command line as a space-separated string.""" cmdline_path = Path(f"/proc/{pid}/cmdline") @@ -562,17 +567,43 @@ def release_scoped_lock(scope: str, identity: str) -> None: pass -def release_all_scoped_locks() -> int: - """Remove all scoped lock files in the lock directory. +def release_all_scoped_locks( + *, + owner_pid: Optional[int] = None, + owner_start_time: Optional[int] = None, +) -> int: + """Remove scoped lock files in the lock directory. Called during --replace to clean up stale locks left by stopped/killed - gateway processes that did not release their locks gracefully. + gateway processes that did not release their locks gracefully. When an + ``owner_pid`` is provided, only lock records belonging to that gateway + process are removed. ``owner_start_time`` further narrows the match to + protect against PID reuse. + + When no owner is provided, preserves the legacy behavior and removes every + scoped lock file in the directory. + Returns the number of lock files removed. """ lock_dir = _get_lock_dir() removed = 0 if lock_dir.exists(): for lock_file in lock_dir.glob("*.lock"): + if owner_pid is not None: + record = _read_json_file(lock_file) + if not isinstance(record, dict): + continue + try: + record_pid = int(record.get("pid")) + except (TypeError, ValueError): + continue + if record_pid != owner_pid: + continue + if ( + owner_start_time is not None + and record.get("start_time") != owner_start_time + ): + continue try: lock_file.unlink(missing_ok=True) removed += 1 diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py index 83ffc0d4d..d94e466ec 100644 --- a/tests/gateway/test_runner_startup_failures.py +++ b/tests/gateway/test_runner_startup_failures.py @@ -193,7 +193,10 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p _pid_state["alive"] = False monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid) monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file) - monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0) + monkeypatch.setattr( + "gateway.status.release_all_scoped_locks", + lambda **kwargs: 0, + ) monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force))) monkeypatch.setattr("gateway.run.os.getpid", lambda: 100) monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None) @@ -267,7 +270,10 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm( _pid_state["alive"] = False monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid) monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file) - monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0) + monkeypatch.setattr( + "gateway.status.release_all_scoped_locks", + lambda **kwargs: 0, + ) monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker) monkeypatch.setattr("gateway.status.terminate_pid", record_terminate) monkeypatch.setattr("gateway.run.os.getpid", lambda: 100) diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index f2b6b1b1f..e91bb6e41 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -404,6 +404,53 @@ class TestScopedLocks: status.release_scoped_lock("telegram-bot-token", "secret") assert not lock_path.exists() + def test_release_all_scoped_locks_can_target_single_owner(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_dir = tmp_path / "locks" + lock_dir.mkdir(parents=True, exist_ok=True) + + target_lock = lock_dir / "telegram-bot-token-target.lock" + other_lock = lock_dir / "slack-app-token-other.lock" + target_lock.write_text(json.dumps({ + "pid": 111, + "start_time": 222, + "kind": "hermes-gateway", + })) + other_lock.write_text(json.dumps({ + "pid": 999, + "start_time": 333, + "kind": "hermes-gateway", + })) + + removed = status.release_all_scoped_locks( + owner_pid=111, + owner_start_time=222, + ) + + assert removed == 1 + assert not target_lock.exists() + assert other_lock.exists() + + def test_release_all_scoped_locks_skips_pid_reuse_mismatch(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_dir = tmp_path / "locks" + lock_dir.mkdir(parents=True, exist_ok=True) + + reused_pid_lock = lock_dir / "telegram-bot-token-reused.lock" + reused_pid_lock.write_text(json.dumps({ + "pid": 111, + "start_time": 999, + "kind": "hermes-gateway", + })) + + removed = status.release_all_scoped_locks( + owner_pid=111, + owner_start_time=222, + ) + + assert removed == 0 + assert reused_pid_lock.exists() + class TestTakeoverMarker: """Tests for the --replace takeover marker.