mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
* fix(telegram): clear send_path_degraded on successful reconnect _send_path_degraded was cleared only in _verify_polling_after_reconnect, 60s after reconnect and only if scheduled. A clean start_polling() reconnect left the flag stuck True, short-circuiting send() and blocking all outbound messages until the deferred probe ran (or forever if it never did). Clear the flag the moment start_polling() succeeds — that is the recovery signal. The deferred probe remains a defensive re-check that re-enters the reconnect ladder (re-setting the flag) if it detects a silent wedge. Fixes #35205. * docs: add infographic for #35205 telegram send-path fix
This commit is contained in:
parent
674e16e7c6
commit
2ecb6f7fe6
3 changed files with 64 additions and 6 deletions
BIN
infographic/telegram-send-path-35205/infographic.png
Normal file
BIN
infographic/telegram-send-path-35205/infographic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.8 MiB |
|
|
@ -1572,6 +1572,17 @@ class TelegramAdapter(BasePlatformAdapter):
|
|||
self.name, attempt,
|
||||
)
|
||||
self._polling_network_error_count = 0
|
||||
# start_polling() succeeding IS the recovery signal: the long-poll
|
||||
# connection is live again, so clear the degraded flag immediately
|
||||
# rather than blocking all outbound sends for the full
|
||||
# HEARTBEAT_PROBE_DELAY window. The deferred probe below is a
|
||||
# defensive re-check — if it later detects a silent wedge (PTB
|
||||
# running=True but consumer task dead) it re-enters the ladder,
|
||||
# which re-sets _send_path_degraded. Without this clear here, a
|
||||
# clean reconnect leaves the flag stuck True until the 60s probe
|
||||
# (or forever, if the probe is never scheduled), blocking the send
|
||||
# path even though the bot has fully recovered. See #35205.
|
||||
self._send_path_degraded = False
|
||||
# start_polling() returning is necessary but not sufficient:
|
||||
# PTB's Updater can be left in a state where `running` is True
|
||||
# but the underlying long-poll task is wedged on a stale httpx
|
||||
|
|
|
|||
|
|
@ -66,8 +66,48 @@ async def test_send_short_circuits_when_path_degraded():
|
|||
|
||||
@pytest.mark.asyncio
|
||||
async def test_reconnect_storm_sets_and_heartbeat_clears_flag(monkeypatch):
|
||||
"""_handle_polling_network_error sets the flag; a successful heartbeat
|
||||
probe in _verify_polling_after_reconnect clears it."""
|
||||
"""_handle_polling_network_error sets the flag while reconnecting; if the
|
||||
reconnect attempt itself raises (polling not yet healthy), the flag stays
|
||||
True until a later successful heartbeat probe in
|
||||
_verify_polling_after_reconnect clears it."""
|
||||
adapter = _make_adapter()
|
||||
adapter._app = MagicMock()
|
||||
adapter._app.updater = MagicMock()
|
||||
adapter._app.updater.running = True
|
||||
adapter._app.updater.stop = AsyncMock()
|
||||
# First start_polling attempt fails — the reconnect handler must leave the
|
||||
# flag set (path still unhealthy) and not clear it prematurely.
|
||||
adapter._app.updater.start_polling = AsyncMock(side_effect=OSError("still down"))
|
||||
adapter._app.bot = MagicMock()
|
||||
adapter._app.bot.get_me = AsyncMock(return_value=MagicMock())
|
||||
adapter._polling_error_callback_ref = AsyncMock()
|
||||
monkeypatch.setattr(
|
||||
"plugins.platforms.telegram.adapter.Update", MagicMock(ALL_TYPES=[])
|
||||
)
|
||||
# Suppress the self-rescheduled retry so the test doesn't recurse.
|
||||
monkeypatch.setattr(
|
||||
"plugins.platforms.telegram.adapter.asyncio.ensure_future", MagicMock()
|
||||
)
|
||||
|
||||
with patch("plugins.platforms.telegram.adapter.asyncio.sleep", new_callable=AsyncMock):
|
||||
await adapter._handle_polling_network_error(OSError("Bad Gateway"))
|
||||
# start_polling failed → path still degraded.
|
||||
assert adapter._send_path_degraded is True
|
||||
|
||||
# Now the deferred probe runs against a recovered (running) updater and
|
||||
# a responsive bot — it clears the flag.
|
||||
adapter._app.updater.running = True
|
||||
with patch("plugins.platforms.telegram.adapter.asyncio.sleep", new_callable=AsyncMock):
|
||||
await adapter._verify_polling_after_reconnect()
|
||||
assert adapter._send_path_degraded is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_successful_reconnect_clears_flag_without_probe(monkeypatch):
|
||||
"""Regression for #35205: a successful start_polling() clears
|
||||
_send_path_degraded immediately, so outbound sends are not blocked for
|
||||
the full HEARTBEAT_PROBE_DELAY window (and never get stuck True if the
|
||||
deferred probe is never scheduled / never runs)."""
|
||||
adapter = _make_adapter()
|
||||
adapter._app = MagicMock()
|
||||
adapter._app.updater = MagicMock()
|
||||
|
|
@ -80,10 +120,17 @@ async def test_reconnect_storm_sets_and_heartbeat_clears_flag(monkeypatch):
|
|||
monkeypatch.setattr(
|
||||
"plugins.platforms.telegram.adapter.Update", MagicMock(ALL_TYPES=[])
|
||||
)
|
||||
|
||||
await adapter._handle_polling_network_error(OSError("Bad Gateway"))
|
||||
assert adapter._send_path_degraded is True
|
||||
# Don't let the deferred probe run — prove the clear happens in the
|
||||
# reconnect handler itself, not in _verify_polling_after_reconnect.
|
||||
monkeypatch.setattr(
|
||||
adapter, "_verify_polling_after_reconnect", AsyncMock()
|
||||
)
|
||||
|
||||
with patch("plugins.platforms.telegram.adapter.asyncio.sleep", new_callable=AsyncMock):
|
||||
await adapter._verify_polling_after_reconnect()
|
||||
await adapter._handle_polling_network_error(OSError("Bad Gateway"))
|
||||
|
||||
assert adapter._send_path_degraded is False
|
||||
assert adapter._polling_network_error_count == 0
|
||||
# And send() works again right away.
|
||||
result = await adapter.send("123", "hello")
|
||||
assert result.success is True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue