fix(gateway): handle stale lock files in acquire_scoped_lock

Updated the acquire_scoped_lock function to treat empty or corrupt lock files as stale. This change ensures that if a lock file exists but is invalid, it will be removed to prevent issues with stale locks. Added tests to verify recovery from both empty and corrupt lock files.
This commit is contained in:
WorldInnovationsDepartment 2026-04-13 12:06:05 +03:00 committed by Teknium
parent 23f668d66e
commit c1809e85e7
2 changed files with 36 additions and 0 deletions

View file

@ -290,6 +290,15 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
}
existing = _read_json_file(lock_path)
if existing is None and lock_path.exists():
# Lock file exists but is empty or contains invalid JSON — treat as
# stale. This happens when a previous process was killed between
# O_CREAT|O_EXCL and the subsequent json.dump() (e.g. DNS failure
# during rapid Slack reconnect retries).
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
if existing:
try:
existing_pid = int(existing["pid"])

View file

@ -209,6 +209,33 @@ class TestScopedLocks:
assert payload["pid"] == os.getpid()
assert payload["metadata"]["platform"] == "telegram"
def test_acquire_scoped_lock_recovers_empty_lock_file(self, tmp_path, monkeypatch):
"""Empty lock file (0 bytes) left by a crashed process should be treated as stale."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text("") # simulate crash between O_CREAT and json.dump
acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
assert acquired is True
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
assert payload["metadata"]["platform"] == "slack"
def test_acquire_scoped_lock_recovers_corrupt_lock_file(self, tmp_path, monkeypatch):
"""Lock file with invalid JSON should be treated as stale."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "slack-app-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text("{truncated") # simulate partial write
acquired, existing = status.acquire_scoped_lock("slack-app-token", "secret", metadata={"platform": "slack"})
assert acquired is True
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))