diff --git a/gateway/status.py b/gateway/status.py
index d7f357b363..a801cfe5b8 100644
--- a/gateway/status.py
+++ b/gateway/status.py
@@ -290,6 +290,15 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
     }
 
     existing = _read_json_file(lock_path)
+    if existing is None and lock_path.exists():
+        # Lock file exists but yielded no payload (empty or invalid JSON) — treat
+        # it as stale and remove it (best-effort; OSError is ignored). This happens
+        # when a previous process was killed between O_CREAT|O_EXCL and json.dump()
+        # (e.g. DNS failure during rapid Slack reconnect retries). NOTE(review): may race a live owner mid-write — confirm.
+        try:
+            lock_path.unlink(missing_ok=True)
+        except OSError:
+            pass
     if existing:
         try:
             existing_pid = int(existing["pid"])
diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py
index 16d4bfc5e8..4b9675e723 100644
--- a/tests/gateway/test_status.py
+++ b/tests/gateway/test_status.py
@@ -209,6 +209,33 @@ class TestScopedLocks:
         assert payload["pid"] == os.getpid()
         assert payload["metadata"]["platform"] == "telegram"
 
+    def test_acquire_scoped_lock_recovers_empty_lock_file(self, tmp_path, monkeypatch):
+        """An empty (0-byte) lock file is treated as stale: acquisition succeeds and the file is rewritten with our pid and metadata."""
+        monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
+        lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"  # must equal the path used for ("slack-app-token", "secret")
+        lock_path.parent.mkdir(parents=True, exist_ok=True)
+        lock_path.write_text("")  # 0-byte file: simulates a crash between O_CREAT|O_EXCL and json.dump()
+
+        acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
+
+        assert acquired is True
+        payload = json.loads(lock_path.read_text())
+        assert payload["pid"] == os.getpid()
+        assert payload["metadata"]["platform"] == "slack"
+
+    def test_acquire_scoped_lock_recovers_corrupt_lock_file(self, tmp_path, monkeypatch):
+        """A lock file holding invalid JSON is treated as stale: acquisition succeeds and the file is rewritten with our pid."""
+        monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
+        lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"  # must equal the path used for ("slack-app-token", "secret")
+        lock_path.parent.mkdir(parents=True, exist_ok=True)
+        lock_path.write_text("{truncated")  # unterminated JSON: simulates a partial write
+
+        acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
+
+        assert acquired is True
+        payload = json.loads(lock_path.read_text())
+        assert payload["pid"] == os.getpid()
+
     def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch):
         monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
 