fix(gateway): keep pending /update completion notifications until the target platform reconnects

This commit is contained in:
Frowtek 2026-06-04 01:38:33 +03:00 committed by Teknium
parent a6a0a5b1b0
commit b7169f9bbb
3 changed files with 113 additions and 16 deletions

View file

@ -15142,10 +15142,15 @@ class GatewayRunner:
if not adapter or not chat_id:
logger.warning("Update watcher: cannot resolve adapter/chat_id, falling back to completion-only")
# Fall back to old behavior: wait for exit code and send final notification
# Fall back to completion-only: wait for the exit code and send the
# final notification. _send_update_notification re-resolves the
# adapter on every call, so when the target platform is still
# reconnecting it returns False and keeps the markers. Keep polling
# until it actually delivers (returns True) instead of giving up
# after the first completion check — otherwise a platform that
# reconnects a few seconds after completion never gets notified.
while (pending_path.exists() or claimed_path.exists()) and loop.time() < deadline:
if exit_code_path.exists():
await self._send_update_notification()
if exit_code_path.exists() and await self._send_update_notification():
return
await asyncio.sleep(poll_interval)
if (pending_path.exists() or claimed_path.exists()) and not exit_code_path.exists():
@ -15359,6 +15364,24 @@ class GatewayRunner:
platform = Platform(platform_str)
adapter = self.adapters.get(platform)
if not adapter and chat_id:
# The update finished, but the target platform has not
# reconnected yet (common right after the restart that
# `hermes update` triggers). Treating "adapter missing" as a
# definitive skip would delete the markers and silently lose the
# completion notification — the user never learns whether the
# update succeeded or timed out. Preserve the markers instead so
# a later retry (the watcher poll loop, or the next gateway
# startup) can deliver the result once the adapter is back.
logger.info(
"Update notification deferred: %s adapter not connected yet",
platform_str,
)
cleanup = False
active_pending_path = pending_path
claimed_path.replace(pending_path)
return False
if adapter and chat_id:
metadata = self._thread_metadata_for_target(
platform,

View file

@ -661,8 +661,14 @@ class TestSendUpdateNotification:
assert not pending_path.exists()
@pytest.mark.asyncio
async def test_no_adapter_for_platform(self, tmp_path):
"""Does not crash if the platform adapter is not connected."""
async def test_no_adapter_for_platform_preserves_markers(self, tmp_path):
"""A finished update whose platform is offline keeps its markers.
When the target platform's adapter has not reconnected yet, dropping
the completion markers would silently lose the notification. Instead the
call defers (returns False) and leaves every marker on disk so a later
retry can deliver once the platform is back.
"""
runner = _make_runner()
hermes_home = tmp_path / "hermes"
hermes_home.mkdir()
@ -680,13 +686,62 @@ class TestSendUpdateNotification:
runner.adapters = {Platform.TELEGRAM: mock_adapter}
with patch("gateway.run._hermes_home", hermes_home):
await runner._send_update_notification()
result = await runner._send_update_notification()
# send should not have been called (wrong platform)
# No send (wrong platform offline) and the result is deferred.
assert result is False
mock_adapter.send.assert_not_called()
# Files should still be cleaned up
# Markers are preserved for a later retry — NOT cleaned up.
assert pending_path.exists()
assert output_path.exists()
assert exit_code_path.exists()
# The marker stays in its canonical pending location (claim restored).
assert not (hermes_home / ".update_pending.claimed.json").exists()
@pytest.mark.asyncio
async def test_deferred_notification_delivers_after_reconnect(self, tmp_path):
"""A deferred completion is delivered once the platform reconnects.
Regression for the late-reconnect /update bug: the update finishes while
the target platform is offline, the markers survive the deferral, and
the next call (after the adapter is registered) delivers the result and
cleans up exactly once.
"""
runner = _make_runner()
hermes_home = tmp_path / "hermes"
hermes_home.mkdir()
pending = {"platform": "discord", "chat_id": "111", "user_id": "222"}
pending_path = hermes_home / ".update_pending.json"
output_path = hermes_home / ".update_output.txt"
exit_code_path = hermes_home / ".update_exit_code"
pending_path.write_text(json.dumps(pending))
output_path.write_text("✓ Update complete!")
exit_code_path.write_text("0")
# First pass: target platform (discord) is still offline → defer.
with patch("gateway.run._hermes_home", hermes_home):
first = await runner._send_update_notification()
assert first is False
assert pending_path.exists()
# Platform reconnects: the reconnect watcher adds the adapter back.
mock_adapter = AsyncMock()
runner.adapters = {Platform.DISCORD: mock_adapter}
with patch("gateway.run._hermes_home", hermes_home):
second = await runner._send_update_notification()
assert second is True
mock_adapter.send.assert_called_once()
sent_text = mock_adapter.send.call_args[0][1]
assert "Update complete" in sent_text
# Now everything is cleaned up — no duplicate deliveries possible.
assert not pending_path.exists()
assert not output_path.exists()
assert not exit_code_path.exists()
assert not (hermes_home / ".update_pending.claimed.json").exists()
# ---------------------------------------------------------------------------

View file

@ -439,30 +439,49 @@ class TestWatchUpdateProgress:
assert "failed" in all_sent.lower()
@pytest.mark.asyncio
async def test_falls_back_when_adapter_unavailable(self, tmp_path):
"""Falls back to legacy notification when adapter can't be resolved."""
async def test_falls_back_and_delivers_after_reconnect(self, tmp_path):
"""Completion-only fallback waits for the platform to reconnect.
When the target adapter isn't connected at watcher start, the watcher
must keep the markers and retry until the platform reconnects, then
deliver the completion notification rather than dropping it on the
first completion check (the late-reconnect /update bug).
"""
runner = _make_runner()
hermes_home = tmp_path / "hermes"
hermes_home.mkdir()
# Platform doesn't match any adapter
# Target platform (discord) isn't connected yet; the update is finished.
pending = {"platform": "discord", "chat_id": "111", "user_id": "222"}
(hermes_home / ".update_pending.json").write_text(json.dumps(pending))
pending_path = hermes_home / ".update_pending.json"
pending_path.write_text(json.dumps(pending))
(hermes_home / ".update_output.txt").write_text("done\n")
(hermes_home / ".update_exit_code").write_text("0")
# Only telegram adapter available
mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter}
# Only telegram is connected at first.
runner.adapters = {Platform.TELEGRAM: AsyncMock()}
discord_adapter = AsyncMock()
async def reconnect_discord():
# The platform reconnect watcher registers discord mid-poll.
await asyncio.sleep(0.3)
runner.adapters[Platform.DISCORD] = discord_adapter
with patch("gateway.run._hermes_home", hermes_home):
task = asyncio.create_task(reconnect_discord())
await runner._watch_update_progress(
poll_interval=0.1,
stream_interval=0.2,
timeout=5.0,
)
await task
# Should not crash; legacy notification handles this case
# The completion was delivered to discord once it reconnected...
discord_adapter.send.assert_called_once()
# ...and the markers are cleaned up after successful delivery.
assert not pending_path.exists()
assert not (hermes_home / ".update_exit_code").exists()
@pytest.mark.asyncio
async def test_prompt_forwarded_only_once(self, tmp_path):