From c1809e85e7465239280ab8de7489fff29760bcd2 Mon Sep 17 00:00:00 2001 From: WorldInnovationsDepartment Date: Mon, 13 Apr 2026 12:06:05 +0300 Subject: [PATCH] fix(gateway): handle stale lock files in acquire_scoped_lock Updated the acquire_scoped_lock function to treat empty or corrupt lock files as stale. This change ensures that if a lock file exists but is invalid, it will be removed to prevent issues with stale locks. Added tests to verify recovery from both empty and corrupt lock files. --- gateway/status.py | 9 +++++++++ tests/gateway/test_status.py | 27 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/gateway/status.py b/gateway/status.py index d7f357b36..a801cfe5b 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -290,6 +290,15 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, } existing = _read_json_file(lock_path) + if existing is None and lock_path.exists(): + # Lock file exists but is empty or contains invalid JSON — treat as + # stale. This happens when a previous process was killed between + # O_CREAT|O_EXCL and the subsequent json.dump() (e.g. DNS failure + # during rapid Slack reconnect retries). + try: + lock_path.unlink(missing_ok=True) + except OSError: + pass if existing: try: existing_pid = int(existing["pid"]) diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 16d4bfc5e..4b9675e72 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -209,6 +209,33 @@ class TestScopedLocks: assert payload["pid"] == os.getpid() assert payload["metadata"]["platform"] == "telegram" + def test_acquire_scoped_lock_recovers_empty_lock_file(self, tmp_path, monkeypatch): + """Empty lock file (0 bytes) left by a crashed process should be treated as stale.""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text("") # simulate crash between O_CREAT and json.dump + + acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"}) + + assert acquired is True + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + assert payload["metadata"]["platform"] == "slack" + + def test_acquire_scoped_lock_recovers_corrupt_lock_file(self, tmp_path, monkeypatch): + """Lock file with invalid JSON should be treated as stale.""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text("{truncated") # simulate partial write + + acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"}) + + assert acquired is True + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch): monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))