diff --git a/gateway/run.py b/gateway/run.py index 31c6807d297..049b07a80bc 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -15142,10 +15142,15 @@ class GatewayRunner: if not adapter or not chat_id: logger.warning("Update watcher: cannot resolve adapter/chat_id, falling back to completion-only") - # Fall back to old behavior: wait for exit code and send final notification + # Fall back to completion-only: wait for the exit code and send the + # final notification. _send_update_notification re-resolves the + # adapter on every call, so when the target platform is still + # reconnecting it returns False and keeps the markers. Keep polling + # until it actually delivers (returns True) instead of giving up + # after the first completion check — otherwise a platform that + # reconnects a few seconds after completion never gets notified. while (pending_path.exists() or claimed_path.exists()) and loop.time() < deadline: - if exit_code_path.exists(): - await self._send_update_notification() + if exit_code_path.exists() and await self._send_update_notification(): return await asyncio.sleep(poll_interval) if (pending_path.exists() or claimed_path.exists()) and not exit_code_path.exists(): @@ -15359,6 +15364,24 @@ class GatewayRunner: platform = Platform(platform_str) adapter = self.adapters.get(platform) + if not adapter and chat_id: + # The update finished, but the target platform has not + # reconnected yet (common right after the restart that + # `hermes update` triggers). Treating "adapter missing" as a + # definitive skip would delete the markers and silently lose the + # completion notification — the user never learns whether the + # update succeeded or timed out. Preserve the markers instead so + # a later retry (the watcher poll loop, or the next gateway + # startup) can deliver the result once the adapter is back. + logger.info( + "Update notification deferred: %s adapter not connected yet", + platform_str, + ) + cleanup = False + active_pending_path = pending_path + claimed_path.replace(pending_path) + return False + if adapter and chat_id: metadata = self._thread_metadata_for_target( platform, diff --git a/tests/gateway/test_update_command.py b/tests/gateway/test_update_command.py index 6ff37c0fbed..64998771d80 100644 --- a/tests/gateway/test_update_command.py +++ b/tests/gateway/test_update_command.py @@ -661,8 +661,14 @@ class TestSendUpdateNotification: assert not pending_path.exists() @pytest.mark.asyncio - async def test_no_adapter_for_platform(self, tmp_path): - """Does not crash if the platform adapter is not connected.""" + async def test_no_adapter_for_platform_preserves_markers(self, tmp_path): + """A finished update whose platform is offline keeps its markers. + + When the target platform's adapter has not reconnected yet, dropping + the completion markers would silently lose the notification. Instead the + call defers (returns False) and leaves every marker on disk so a later + retry can deliver once the platform is back. + """ runner = _make_runner() hermes_home = tmp_path / "hermes" hermes_home.mkdir() @@ -680,13 +686,62 @@ class TestSendUpdateNotification: runner.adapters = {Platform.TELEGRAM: mock_adapter} with patch("gateway.run._hermes_home", hermes_home): - await runner._send_update_notification() + result = await runner._send_update_notification() - # send should not have been called (wrong platform) + # No send (wrong platform offline) and the result is deferred. + assert result is False mock_adapter.send.assert_not_called() - # Files should still be cleaned up + # Markers are preserved for a later retry — NOT cleaned up. + assert pending_path.exists() + assert output_path.exists() + assert exit_code_path.exists() + # The marker stays in its canonical pending location (claim restored). + assert not (hermes_home / ".update_pending.claimed.json").exists() + + @pytest.mark.asyncio + async def test_deferred_notification_delivers_after_reconnect(self, tmp_path): + """A deferred completion is delivered once the platform reconnects. + + Regression for the late-reconnect /update bug: the update finishes while + the target platform is offline, the markers survive the deferral, and + the next call (after the adapter is registered) delivers the result and + cleans up — exactly once. + """ + runner = _make_runner() + hermes_home = tmp_path / "hermes" + hermes_home.mkdir() + + pending = {"platform": "discord", "chat_id": "111", "user_id": "222"} + pending_path = hermes_home / ".update_pending.json" + output_path = hermes_home / ".update_output.txt" + exit_code_path = hermes_home / ".update_exit_code" + pending_path.write_text(json.dumps(pending)) + output_path.write_text("✓ Update complete!") + exit_code_path.write_text("0") + + # First pass: target platform (discord) is still offline → defer. + with patch("gateway.run._hermes_home", hermes_home): + first = await runner._send_update_notification() + + assert first is False + assert pending_path.exists() + + # Platform reconnects: the reconnect watcher adds the adapter back. + mock_adapter = AsyncMock() + runner.adapters = {Platform.DISCORD: mock_adapter} + + with patch("gateway.run._hermes_home", hermes_home): + second = await runner._send_update_notification() + + assert second is True + mock_adapter.send.assert_called_once() + sent_text = mock_adapter.send.call_args[0][1] + assert "Update complete" in sent_text + # Now everything is cleaned up — no duplicate deliveries possible. assert not pending_path.exists() + assert not output_path.exists() assert not exit_code_path.exists() + assert not (hermes_home / ".update_pending.claimed.json").exists() # --------------------------------------------------------------------------- diff --git a/tests/gateway/test_update_streaming.py b/tests/gateway/test_update_streaming.py index e15e1c3f1f6..3a48ab9171f 100644 --- a/tests/gateway/test_update_streaming.py +++ b/tests/gateway/test_update_streaming.py @@ -439,30 +439,49 @@ class TestWatchUpdateProgress: assert "failed" in all_sent.lower() @pytest.mark.asyncio - async def test_falls_back_when_adapter_unavailable(self, tmp_path): - """Falls back to legacy notification when adapter can't be resolved.""" + async def test_falls_back_and_delivers_after_reconnect(self, tmp_path): + """Completion-only fallback waits for the platform to reconnect. + + When the target adapter isn't connected at watcher start, the watcher + must keep the markers and retry until the platform reconnects, then + deliver the completion notification — rather than dropping it on the + first completion check (the late-reconnect /update bug). + """ runner = _make_runner() hermes_home = tmp_path / "hermes" hermes_home.mkdir() - # Platform doesn't match any adapter + # Target platform (discord) isn't connected yet; the update is finished. pending = {"platform": "discord", "chat_id": "111", "user_id": "222"} - (hermes_home / ".update_pending.json").write_text(json.dumps(pending)) + pending_path = hermes_home / ".update_pending.json" + pending_path.write_text(json.dumps(pending)) (hermes_home / ".update_output.txt").write_text("done\n") (hermes_home / ".update_exit_code").write_text("0") - # Only telegram adapter available - mock_adapter = AsyncMock() - runner.adapters = {Platform.TELEGRAM: mock_adapter} + # Only telegram is connected at first. + runner.adapters = {Platform.TELEGRAM: AsyncMock()} + + discord_adapter = AsyncMock() + + async def reconnect_discord(): + # The platform reconnect watcher registers discord mid-poll. + await asyncio.sleep(0.3) + runner.adapters[Platform.DISCORD] = discord_adapter with patch("gateway.run._hermes_home", hermes_home): + task = asyncio.create_task(reconnect_discord()) await runner._watch_update_progress( poll_interval=0.1, stream_interval=0.2, timeout=5.0, ) + await task - # Should not crash; legacy notification handles this case + # The completion was delivered to discord once it reconnected... + discord_adapter.send.assert_called_once() + # ...and the markers are cleaned up after successful delivery. + assert not pending_path.exists() + assert not (hermes_home / ".update_exit_code").exists() @pytest.mark.asyncio async def test_prompt_forwarded_only_once(self, tmp_path):