fix(gateway): mark platform lock failure as retryable instead of permanently fatal

When a stale lock file survives a gateway crash, `acquire_scoped_lock()`
may return `(False, existing_dict)` even after detecting the stale lock
and attempting to delete it (e.g. if the unlink fails or another process
wins a race for the lock).

Previously, `_acquire_platform_lock()` called
`_set_fatal_error(..., retryable=False)`, which permanently killed the
platform — the reconnect watcher never retries a non-retryable fatal
error.

Change to `retryable=True` so the platform enters the "retrying"
state and the reconnect watcher can attempt acquisition again after the
standard backoff delay.

Fixes #54167
This commit is contained in:
liuhao1024 2026-06-28 18:06:51 +08:00 committed by Teknium
parent 61622bb56a
commit 9d919daf44
2 changed files with 74 additions and 1 deletions

View file

@ -2592,7 +2592,7 @@ class BasePlatformAdapter(ABC):
+ '. Stop the other gateway first.'
)
logger.error('[%s] %s', self.name, message)
self._set_fatal_error(f'{scope}_lock', message, retryable=False)
self._set_fatal_error(f'{scope}_lock', message, retryable=True)
return False
def _release_platform_lock(self) -> None:

View file

@ -0,0 +1,73 @@
"""Regression test for #54167 — stale platform lock must be retryable.
When a gateway process is killed (SIGKILL, crash) during Telegram
initialization, the scoped lock file survives. On next startup,
``acquire_scoped_lock()`` detects the stale lock and tries to delete it,
but may still return ``(False, existing_dict)`` to the caller (e.g. if
the unlink fails due to permissions, or a race condition lets another
process grab the lock first).
``_acquire_platform_lock()`` must mark such failures as **retryable**
so the reconnect watcher can retry after a delay instead of permanently
killing the platform.
Contract asserted here
----------------------
``_set_fatal_error`` is called with ``retryable=True`` when lock
acquisition fails, regardless of the reason.
"""
from typing import Any, Dict
from unittest.mock import MagicMock, patch
import pytest
from gateway.platforms.base import BasePlatformAdapter
class _StubAdapter(BasePlatformAdapter):
    """Bare-bones adapter subclass for exercising ``_acquire_platform_lock``.

    The four coroutine methods below are trivial stand-ins (presumably the
    abstract hooks of ``BasePlatformAdapter`` -- confirm against the base
    class); none of them performs any I/O.
    """

    # Stand-in for the platform identifier; only ``.value`` is populated.
    platform = MagicMock(value="telegram")

    async def connect(self, *, is_reconnect: bool = False) -> bool:
        """Pretend the connection succeeded."""
        return True

    async def disconnect(self) -> None:
        """Do nothing."""
        return None

    async def send(self, *args: Any, **kwargs: Any) -> None:
        """Accept and discard any arguments."""
        return None

    async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
        """Return an empty mapping for every chat id."""
        return dict()
@pytest.fixture()
def adapter():
    """Build a ``_StubAdapter`` without running ``__init__``.

    The instance is allocated with ``__new__`` and then given the listed
    attributes directly, so no constructor logic runs.
    """
    # Insertion order matches the order in which the attributes are assigned.
    initial_state = {
        "_running": True,
        "_fatal_error_code": None,
        "_fatal_error_message": None,
        "_fatal_error_retryable": True,
        "_fatal_error_handler": None,
        "_platform_lock_scope": None,
        "_platform_lock_identity": None,
        "_status_write_logged": None,
    }
    stub = _StubAdapter.__new__(_StubAdapter)
    for attr_name, attr_value in initial_state.items():
        setattr(stub, attr_name, attr_value)
    return stub
def test_stale_lock_failure_is_retryable(adapter):
    """A failed lock acquisition must leave the adapter retryable (#54167).

    ``gateway.status.acquire_scoped_lock`` is patched to report that another
    holder owns the lock, and ``_write_runtime_status_safe`` is replaced with
    a mock for the duration of the call.
    """
    other_holder = {"pid": 99999, "start_time": "2026-01-01T00:00:00Z"}

    with patch(
        "gateway.status.acquire_scoped_lock",
        return_value=(False, other_holder),
    ):
        with patch.object(adapter, "_write_runtime_status_safe"):
            acquired = adapter._acquire_platform_lock(
                "telegram-bot-token", "test-token", "Telegram bot token"
            )

    assert acquired is False
    assert adapter._fatal_error_retryable is True
    assert adapter._fatal_error_code == "telegram-bot-token_lock"