From e860a40e14b1e936ab2e5ad646546380ad0a4e57 Mon Sep 17 00:00:00 2001 From: infinitycrew39 Date: Sat, 27 Jun 2026 22:42:22 +0700 Subject: [PATCH] fix(agent,gateway): surface partial-stream recovery and bound detached restart Salvage of NousResearch/hermes-agent#41498 (0-CYBERDYNE-SYSTEMS-0). - Leave response_previewed false on partial_stream_recovery so gateway fallback delivery can send the recovered fragment plus explanation. - Always append the turn-completion explainer for partial_stream_recovery, not only for empty or very short fragments (#34452 gap). - Launch the detached /restart helper before drain, idempotently, with a bounded wait of restart_drain_timeout + 5s. --- agent/conversation_loop.py | 6 +++++- agent/turn_finalizer.py | 9 ++++++++- gateway/run.py | 21 +++++++++++++++++---- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index cb71e724159..e482c3a6444 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -4392,7 +4392,11 @@ def run_conversation( "as final response" ) final_response = _recovered - agent._response_was_previewed = True + # Streaming delivered a fragment, not a confirmed + # final preview. Leave response_previewed false so + # gateway fallback delivery can send the recovered + # text plus the abnormal-turn explanation. + agent._response_was_previewed = False break # If the previous turn already delivered real content alongside diff --git a/agent/turn_finalizer.py b/agent/turn_finalizer.py index c51c18f93cc..f09cc26c07f 100644 --- a/agent/turn_finalizer.py +++ b/agent/turn_finalizer.py @@ -289,7 +289,14 @@ def finalize_turn( and len(_stripped) <= 24 and _stripped[-1:] not in {".", "!", "?", "。", "!", "?", "`", ")"} ) - if _is_empty_terminal or _is_partial_fragment: + _is_partial_stream_recovery = ( + str(_turn_exit_reason) == "partial_stream_recovery" + ) + if ( + _is_empty_terminal + or _is_partial_fragment + or _is_partial_stream_recovery + ): _explanation = agent._format_turn_completion_explanation( _turn_exit_reason ) diff --git a/gateway/run.py b/gateway/run.py index bff09b8d646..f8e8232f36c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2527,6 +2527,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew _restart_task_started: bool = False _restart_detached: bool = False _restart_via_service: bool = False + _detached_restart_helper_started: bool = False _restart_command_source: Optional[SessionSource] = None _stop_task: Optional[asyncio.Task] = None _restart_task: Optional[asyncio.Task] = None @@ -2620,6 +2621,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._restart_task_started = False self._restart_detached = False self._restart_via_service = False + self._detached_restart_helper_started = False self._restart_command_source: Optional[SessionSource] = None self._stop_task: Optional[asyncio.Task] = None self._restart_task: Optional[asyncio.Task] = None @@ -5360,8 +5362,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew if not hermes_cmd: logger.error("Could not locate hermes binary for detached /restart") return + if self._detached_restart_helper_started: + return + self._detached_restart_helper_started = True current_pid = os.getpid() + restart_after_s = max(float(getattr(self, "_restart_drain_timeout", 0.0) or 0.0) + 5.0, 5.0) # On Windows there's no bash/setsid chain — spawn a tiny Python # watcher directly via sys.executable instead. The watcher polls @@ -5378,8 +5384,9 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew import os, subprocess, sys, time from hermes_cli._subprocess_compat import windows_detach_flags_without_breakaway pid = int(sys.argv[1]) - cmd = sys.argv[2:] - deadline = time.monotonic() + 120 + restart_after_s = float(sys.argv[2]) + cmd = sys.argv[3:] + deadline = time.monotonic() + restart_after_s def _alive(p): # On Windows, os.kill(pid, 0) is NOT a no-op — it maps to @@ -5435,7 +5442,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew pythonpath.append(watcher_env["PYTHONPATH"]) watcher_env["PYTHONPATH"] = os.pathsep.join(dict.fromkeys(pythonpath)) subprocess.Popen( - [sys.executable, "-c", watcher, str(current_pid), *cmd_argv], + [sys.executable, "-c", watcher, str(current_pid), str(restart_after_s), *cmd_argv], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=watcher_env, @@ -5445,7 +5452,8 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew cmd = " ".join(shlex.quote(part) for part in hermes_cmd) shell_cmd = ( - f"while kill -0 {current_pid} 2>/dev/null; do sleep 0.2; done; " + f"deadline=$(( $(date +%s) + {int(restart_after_s)} )); " + f"while kill -0 {current_pid} 2>/dev/null && [ $(date +%s) -lt $deadline ]; do sleep 0.2; done; " f"{cmd} gateway restart" ) # Same marker scrub as the Windows watcher above: this watcher runs @@ -5559,6 +5567,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._restart_task_started = True async def _run_restart() -> None: + if detached: + try: + await self._launch_detached_restart_command() + except Exception as e: + logger.error("Failed to launch detached gateway restart helper: %s", e) await asyncio.sleep(0.05) await self.stop(restart=True, detached_restart=detached, service_restart=via_service)