fix(gateway): fix Feishu reconnect message drops and shutdown hang

This commit fixes two critical bugs in the Feishu adapter that affect
message reliability and process lifecycle.

**Bug Fix 1: Intermittent Message Drops**

Root cause: Event handler was created once in __init__ and reused across
reconnects, causing callbacks to capture stale loop references. When the
adapter disconnected and reconnected, old callbacks continued firing with
invalid loop references, resulting in dropped messages with warnings:
"[Feishu] Dropping inbound message before adapter loop is ready"

Fix:
- Rebuild event handler on each connect (websocket/webhook)
- Clear handler on disconnect
- Ensure callbacks always capture current valid loop
- Add defensive loop.is_closed() checks with getattr for test compatibility
- Unify webhook dispatch path to use same loop checks as websocket mode

**Bug Fix 2: Process Hangs on Ctrl+C / SIGTERM**

Root cause: Feishu SDK's websocket client runs in a background thread with
an infinite _select() loop that never exits naturally. The thread was never
properly joined on disconnect, causing processes to hang indefinitely after
Ctrl+C or gateway stop commands.

Fix:
- Store reference to thread-local event loop (_ws_thread_loop)
- On disconnect, cancel all tasks in thread loop and stop it gracefully
  via call_soon_threadsafe()
- Await thread future with 10s timeout
- Clean up pending tasks in thread's finally block before closing loop
- Add detailed debug logging for disconnect flow

**Additional Improvements:**
- Add regression tests for disconnect cleanup and webhook dispatch
- Ensure all event callbacks check loop readiness before dispatching

Tested on Linux with websocket mode. All Feishu tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
jtuki 2026-04-05 21:56:58 +08:00 committed by Teknium
parent abd24d381b
commit 7cf4bd06bf
2 changed files with 134 additions and 16 deletions

View file

@ -8,7 +8,7 @@ import time
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock, Mock, patch
try:
import lark_oapi
@ -289,7 +289,7 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
patch("gateway.platforms.feishu.FEISHU_AVAILABLE", True),
patch("gateway.platforms.feishu.FEISHU_WEBSOCKET_AVAILABLE", True),
patch("gateway.platforms.feishu.lark", SimpleNamespace(LogLevel=SimpleNamespace(INFO="INFO", WARNING="WARNING"))),
patch("gateway.platforms.feishu.EventDispatcherHandler", object()),
patch("gateway.platforms.feishu.EventDispatcherHandler") as mock_handler_class,
patch("gateway.platforms.feishu.FeishuWSClient", return_value=ws_client),
patch("gateway.platforms.feishu._run_official_feishu_ws_client"),
patch("gateway.platforms.feishu.acquire_scoped_lock", return_value=(True, None)) as acquire_lock,
@ -297,6 +297,15 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
patch.object(adapter, "_hydrate_bot_identity", new=AsyncMock()),
patch.object(adapter, "_build_lark_client", return_value=SimpleNamespace()),
):
mock_builder = Mock()
mock_builder.register_p2_im_message_message_read_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_im_message_receive_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_im_message_reaction_created_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_im_message_reaction_deleted_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_card_action_trigger = Mock(return_value=mock_builder)
mock_builder.build = Mock(return_value=object())
mock_handler_class.builder = Mock(return_value=mock_builder)
loop = asyncio.new_event_loop()
future = loop.create_future()
future.set_result(None)
@ -305,6 +314,9 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
def run_in_executor(self, *_args, **_kwargs):
return future
def is_closed(self):
return False
try:
with patch("gateway.platforms.feishu.asyncio.get_running_loop", return_value=_Loop()):
connected = asyncio.run(adapter.connect())
@ -313,6 +325,7 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
loop.close()
self.assertTrue(connected)
self.assertIsNone(adapter._event_handler)
acquire_lock.assert_called_once_with(
"feishu-app-id",
"cli_app",
@ -361,7 +374,7 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
patch("gateway.platforms.feishu.FEISHU_AVAILABLE", True),
patch("gateway.platforms.feishu.FEISHU_WEBSOCKET_AVAILABLE", True),
patch("gateway.platforms.feishu.lark", SimpleNamespace(LogLevel=SimpleNamespace(INFO="INFO", WARNING="WARNING"))),
patch("gateway.platforms.feishu.EventDispatcherHandler", object()),
patch("gateway.platforms.feishu.EventDispatcherHandler") as mock_handler_class,
patch("gateway.platforms.feishu.FeishuWSClient", return_value=ws_client),
patch("gateway.platforms.feishu.acquire_scoped_lock", return_value=(True, None)),
patch("gateway.platforms.feishu.release_scoped_lock"),
@ -369,6 +382,15 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
patch("gateway.platforms.feishu.asyncio.sleep", side_effect=lambda delay: sleeps.append(delay)),
patch.object(adapter, "_build_lark_client", return_value=SimpleNamespace()),
):
mock_builder = Mock()
mock_builder.register_p2_im_message_message_read_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_im_message_receive_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_im_message_reaction_created_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_im_message_reaction_deleted_v1 = Mock(return_value=mock_builder)
mock_builder.register_p2_card_action_trigger = Mock(return_value=mock_builder)
mock_builder.build = Mock(return_value=object())
mock_handler_class.builder = Mock(return_value=mock_builder)
loop = asyncio.new_event_loop()
future = loop.create_future()
future.set_result(None)
@ -383,6 +405,9 @@ class TestFeishuAdapterMessaging(unittest.TestCase):
raise OSError("temporary websocket failure")
return future
def is_closed(self):
return False
fake_loop = _Loop()
try:
with patch("gateway.platforms.feishu.asyncio.get_running_loop", return_value=fake_loop):
@ -1196,7 +1221,12 @@ class TestAdapterBehavior(unittest.TestCase):
from gateway.platforms.feishu import FeishuAdapter
adapter = FeishuAdapter(PlatformConfig())
adapter._loop = object()
class _Loop:
def is_closed(self):
return False
adapter._loop = _Loop()
message = SimpleNamespace(
message_id="om_text",
@ -1210,6 +1240,7 @@ class TestAdapterBehavior(unittest.TestCase):
data = SimpleNamespace(event=SimpleNamespace(message=message, sender=sender))
future = SimpleNamespace(add_done_callback=lambda *_args, **_kwargs: None)
def _submit(coro, _loop):
coro.close()
return future
@ -1219,6 +1250,30 @@ class TestAdapterBehavior(unittest.TestCase):
self.assertTrue(submit.called)
@patch.dict(os.environ, {}, clear=True)
def test_webhook_request_uses_same_message_dispatch_path(self):
from gateway.config import PlatformConfig
from gateway.platforms.feishu import FeishuAdapter
adapter = FeishuAdapter(PlatformConfig())
adapter._on_message_event = Mock()
body = json.dumps({
"header": {"event_type": "im.message.receive_v1"},
"event": {"message": {"message_id": "om_test"}},
}).encode("utf-8")
request = SimpleNamespace(
remote="127.0.0.1",
content_length=None,
headers={},
read=AsyncMock(return_value=body),
)
response = asyncio.run(adapter._handle_webhook_request(request))
self.assertEqual(response.status, 200)
adapter._on_message_event.assert_called_once()
@patch.dict(os.environ, {}, clear=True)
def test_process_inbound_message_uses_event_sender_identity_only(self):
from gateway.config import PlatformConfig