fix(agent,gateway): surface partial-stream recovery and bound detached restart

Salvage of NousResearch/hermes-agent#41498 (0-CYBERDYNE-SYSTEMS-0).

- Leave response_previewed false on partial_stream_recovery so gateway
  fallback delivery can send the recovered fragment plus explanation.
- Always append the turn-completion explainer for partial_stream_recovery,
  not only for empty or very short fragments (#34452 gap).
- Launch the detached /restart helper before drain, idempotently, with a
  bounded wait of restart_drain_timeout + 5s.
This commit is contained in:
infinitycrew39 2026-06-27 22:42:22 +07:00 committed by Teknium
parent e3c9924b8b
commit e860a40e14
3 changed files with 30 additions and 6 deletions

View file

@ -4392,7 +4392,11 @@ def run_conversation(
"as final response"
)
final_response = _recovered
agent._response_was_previewed = True
# Streaming delivered a fragment, not a confirmed
# final preview. Leave response_previewed false so
# gateway fallback delivery can send the recovered
# text plus the abnormal-turn explanation.
agent._response_was_previewed = False
break
# If the previous turn already delivered real content alongside

View file

@ -289,7 +289,14 @@ def finalize_turn(
and len(_stripped) <= 24
and _stripped[-1:] not in {".", "!", "?", "", "", "", "`", ")"}
)
if _is_empty_terminal or _is_partial_fragment:
_is_partial_stream_recovery = (
str(_turn_exit_reason) == "partial_stream_recovery"
)
if (
_is_empty_terminal
or _is_partial_fragment
or _is_partial_stream_recovery
):
_explanation = agent._format_turn_completion_explanation(
_turn_exit_reason
)

View file

@ -2527,6 +2527,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
_restart_task_started: bool = False
_restart_detached: bool = False
_restart_via_service: bool = False
_detached_restart_helper_started: bool = False
_restart_command_source: Optional[SessionSource] = None
_stop_task: Optional[asyncio.Task] = None
_restart_task: Optional[asyncio.Task] = None
@ -2620,6 +2621,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._restart_task_started = False
self._restart_detached = False
self._restart_via_service = False
self._detached_restart_helper_started = False
self._restart_command_source: Optional[SessionSource] = None
self._stop_task: Optional[asyncio.Task] = None
self._restart_task: Optional[asyncio.Task] = None
@ -5360,8 +5362,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
if not hermes_cmd:
logger.error("Could not locate hermes binary for detached /restart")
return
if self._detached_restart_helper_started:
return
self._detached_restart_helper_started = True
current_pid = os.getpid()
restart_after_s = max(float(getattr(self, "_restart_drain_timeout", 0.0) or 0.0) + 5.0, 5.0)
# On Windows there's no bash/setsid chain — spawn a tiny Python
# watcher directly via sys.executable instead. The watcher polls
@ -5378,8 +5384,9 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
import os, subprocess, sys, time
from hermes_cli._subprocess_compat import windows_detach_flags_without_breakaway
pid = int(sys.argv[1])
cmd = sys.argv[2:]
deadline = time.monotonic() + 120
restart_after_s = float(sys.argv[2])
cmd = sys.argv[3:]
deadline = time.monotonic() + restart_after_s
def _alive(p):
# On Windows, os.kill(pid, 0) is NOT a no-op — it maps to
@ -5435,7 +5442,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
pythonpath.append(watcher_env["PYTHONPATH"])
watcher_env["PYTHONPATH"] = os.pathsep.join(dict.fromkeys(pythonpath))
subprocess.Popen(
[sys.executable, "-c", watcher, str(current_pid), *cmd_argv],
[sys.executable, "-c", watcher, str(current_pid), str(restart_after_s), *cmd_argv],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
env=watcher_env,
@ -5445,7 +5452,8 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
cmd = " ".join(shlex.quote(part) for part in hermes_cmd)
shell_cmd = (
f"while kill -0 {current_pid} 2>/dev/null; do sleep 0.2; done; "
f"deadline=$(( $(date +%s) + {int(restart_after_s)} )); "
f"while kill -0 {current_pid} 2>/dev/null && [ $(date +%s) -lt $deadline ]; do sleep 0.2; done; "
f"{cmd} gateway restart"
)
# Same marker scrub as the Windows watcher above: this watcher runs
@ -5559,6 +5567,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._restart_task_started = True
async def _run_restart() -> None:
if detached:
try:
await self._launch_detached_restart_command()
except Exception as e:
logger.error("Failed to launch detached gateway restart helper: %s", e)
await asyncio.sleep(0.05)
await self.stop(restart=True, detached_restart=detached, service_restart=via_service)