From 9d919daf446a4d4379a0fd65c72d21aec379ffb2 Mon Sep 17 00:00:00 2001
From: liuhao1024
Date: Sun, 28 Jun 2026 18:06:51 +0800
Subject: [PATCH] fix(gateway): mark platform lock failure as retryable instead of permanently fatal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a stale lock file survives a gateway crash, `acquire_scoped_lock()`
may return `(False, existing_dict)` even after detecting the stale lock
and attempting to delete it (e.g. if the unlink fails or a race
condition occurs).

Previously, `_acquire_platform_lock()` called
`_set_fatal_error(..., retryable=False)`, which permanently killed the
platform — the reconnect watcher never retries a non-retryable fatal
error.

Change to `retryable=True` so the platform enters the "retrying" state
and the reconnect watcher can attempt acquisition again after the
standard backoff delay.

Fixes #54167
---
 gateway/platforms/base.py                     |  2 +-
 .../test_stale_platform_lock_retryable.py     | 73 +++++++++++++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 tests/gateway/test_stale_platform_lock_retryable.py

diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index 4066868fa69..5ded48a18ca 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -2592,7 +2592,7 @@ class BasePlatformAdapter(ABC):
             + '. Stop the other gateway first.'
         )
         logger.error('[%s] %s', self.name, message)
-        self._set_fatal_error(f'{scope}_lock', message, retryable=False)
+        self._set_fatal_error(f'{scope}_lock', message, retryable=True)
         return False
 
     def _release_platform_lock(self) -> None:
diff --git a/tests/gateway/test_stale_platform_lock_retryable.py b/tests/gateway/test_stale_platform_lock_retryable.py
new file mode 100644
index 00000000000..39f1ab2c388
--- /dev/null
+++ b/tests/gateway/test_stale_platform_lock_retryable.py
@@ -0,0 +1,73 @@
+"""Regression test for #54167 — stale platform lock must be retryable.
+
+When a gateway process is killed (SIGKILL, crash) during Telegram
+initialization, the scoped lock file survives. On next startup,
+``acquire_scoped_lock()`` detects the stale lock and tries to delete it,
+but may still return ``(False, existing_dict)`` to the caller (e.g. if
+the unlink fails due to permissions, or a race condition lets another
+process grab the lock first).
+
+``_acquire_platform_lock()`` must mark such failures as **retryable**
+so the reconnect watcher can retry after a delay — not permanently kill
+the platform.
+
+Contract asserted here
+----------------------
+``_set_fatal_error`` is called with ``retryable=True`` when lock
+acquisition fails, regardless of the reason.
+"""
+
+from typing import Any, Dict
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from gateway.platforms.base import BasePlatformAdapter
+
+
+class _StubAdapter(BasePlatformAdapter):
+    """Minimal concrete subclass for testing _acquire_platform_lock."""
+
+    platform = MagicMock(value="telegram")
+
+    async def connect(self, *, is_reconnect: bool = False) -> bool:
+        return True
+
+    async def disconnect(self) -> None:
+        pass
+
+    async def send(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+    async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
+        return {}
+
+
+@pytest.fixture()
+def adapter():
+    """Build a _StubAdapter via __new__ (no __init__) and seed its state."""
+    obj = _StubAdapter.__new__(_StubAdapter)
+    obj._running = True
+    obj._fatal_error_code = None
+    obj._fatal_error_message = None
+    obj._fatal_error_retryable = False  # seeded opposite to the asserted value
+    obj._fatal_error_handler = None
+    obj._platform_lock_scope = None
+    obj._platform_lock_identity = None
+    obj._status_write_logged = None
+    return obj
+
+
+def test_stale_lock_failure_is_retryable(adapter):
+    """Lock failure must be retryable, not permanently fatal (#54167)."""
+    with patch(
+        "gateway.status.acquire_scoped_lock",
+        return_value=(False, {"pid": 99999, "start_time": "2026-01-01T00:00:00Z"}),
+    ), patch.object(adapter, "_write_runtime_status_safe"):
+        result = adapter._acquire_platform_lock(
+            "telegram-bot-token", "test-token", "Telegram bot token"
+        )
+
+    assert result is False
+    assert adapter._fatal_error_retryable is True
+    assert adapter._fatal_error_code == "telegram-bot-token_lock"