mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
A WebUI/TUI session whose last turn died mid-tool-loop (stale-timeout kill, interrupt, or process restart before the tool result was written) persists a dangling assistant(tool_calls) or interrupted assistant->tool tail. The messaging gateway already strips these tails before replay (the #49201 fix), but the TUI/WebUI resume path fed db.get_messages_as_conversation() straight in as the agent's conversation_history with no cleanup. The model re-issued the unanswered call on every resume -- including after a full WebUI + Gateway restart, since the poison lives in the SessionDB, not memory -- leaving the session permanently 'thinking'. Only deleting the session recovered it. - Extract the two strippers + helper from gateway/run.py into a shared agent/replay_cleanup.py (sanitize_replay_history wraps both). - gateway/run.py re-exports under the historical private names; messaging behavior unchanged. - Both TUI cold-resume sites now sanitize the model-fed history while leaving the display transcript untouched, so the user still sees their full history. Verified E2E against a real SessionDB: dangling and interrupted tails are stripped from the model feed, healthy mid-progress tool sequences are preserved, and the display transcript is always the full raw history.
140 lines
5.5 KiB
Python
140 lines
5.5 KiB
Python
"""Replay-history sanitization shared across resume code paths.
|
||
|
||
When a session's last turn dies mid-tool-loop — the process is killed by a
|
||
restart/shutdown command, a stale-timeout fires, or an interrupt lands before
|
||
the tool result is written — the persisted transcript can end with a dangling
|
||
``assistant(tool_calls)`` (no matching ``tool`` answer) or an interrupted
|
||
``assistant→tool`` block. On resume the model sees that broken tail and
|
||
re-issues the unanswered call, producing an endless "thinking"/reboot loop
|
||
(#49201, #29086).
|
||
|
||
These pure helpers strip those tails before the history is replayed to the
|
||
model. They were originally local to ``gateway/run.py`` (which fixed the
|
||
messaging-gateway path) and are extracted here so every resume surface — the
|
||
messaging gateway AND the TUI/WebUI gateway — shares the same cleanup instead
|
||
of the WebUI path silently skipping it.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from typing import Any, Dict, List
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def is_interrupted_tool_result(content: Any) -> bool:
    """Report whether a tool result's content marks the tool as interrupted.

    Matching is case-insensitive and purely substring based. Non-string
    content is never considered interrupted. A string qualifies when it
    either carries the literal ``[command interrupted]`` marker, or mentions
    ``exit_code`` together with ``130`` or ``-1`` *and* the word fragment
    ``interrupt``.
    """
    if not isinstance(content, str):
        return False

    text = content.lower()

    # Explicit marker in the tool output.
    has_marker = "[command interrupted]" in text

    # Exit-code style payload: the numeric hint alone is not enough, the
    # text must also mention an interrupt.
    has_interrupt_exit = (
        "exit_code" in text
        and ("130" in text or "-1" in text)
        and "interrupt" in text
    )
    return has_marker or has_interrupt_exit
|
||
|
||
|
||
def strip_interrupted_tool_tails(
    agent_history: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Strip interrupted assistant→tool sequences from replay history.

    Older interrupted gateway turns can be followed by a queued real user
    message, so the interrupted assistant/tool block is not necessarily the
    final tail by the time we rebuild replay history. Remove any contiguous
    assistant(tool_calls) + tool-result block that contains an interrupted tool
    result, while preserving successful tool-call sequences intact. A ``tool``
    message that is interrupted but not part of such a block is dropped on
    its own.

    Entries that are not dicts are treated as having no role and are passed
    through unchanged instead of raising ``AttributeError``; this matches the
    ``isinstance(..., dict)`` tolerance of ``strip_dangling_tool_call_tail``.

    An empty (falsy) history is returned as-is. For non-empty input the
    result is always a newly built list, even when nothing was removed; the
    message objects inside it are the same objects as in the input.
    """
    if not agent_history:
        return agent_history

    def _role(entry: Any) -> Any:
        # Malformed (non-dict) rows carry no role, so they never start or
        # extend an assistant→tool block.
        return entry.get("role") if isinstance(entry, dict) else None

    cleaned: List[Dict[str, Any]] = []
    i = 0
    n = len(agent_history)
    while i < n:
        msg = agent_history[i]
        role = _role(msg)
        if role == "assistant" and "tool_calls" in msg:
            # Collect the contiguous run of tool rows directly after this
            # assistant turn; j ends one past the last of them.
            j = i + 1
            while j < n and _role(agent_history[j]) == "tool":
                j += 1
            tool_results = agent_history[i + 1:j]
            if any(
                is_interrupted_tool_result(m.get("content", ""))
                for m in tool_results
            ):
                logger.debug(
                    "Stripping interrupted assistant→tool replay block "
                    "(indices %d–%d, tool_results=%d)",
                    i, j - 1, len(tool_results),
                )
                # Skip the assistant message and every tool row of the block.
                i = j
                continue
        elif role == "tool" and is_interrupted_tool_result(msg.get("content", "")):
            logger.debug("Stripping orphan interrupted tool result from replay history")
            i += 1
            continue
        cleaned.append(msg)
        i += 1

    return cleaned
|
||
|
||
|
||
def strip_dangling_tool_call_tail(
    agent_history: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Drop a final ``assistant(tool_calls)`` message that was never answered.

    If a tool call takes down the gateway process itself (``docker restart``,
    ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
    dies by SIGKILL *mid-call*: no tool result is written and the orderly
    shutdown rewind (``_drop_trailing_empty_response_scaffolding``) never
    runs. What remains persisted is the ``assistant`` message carrying the
    ``tool_calls`` with zero matching ``tool`` rows after it.

    Replaying that tail makes the model re-issue the unanswered call, which
    restarts the gateway again — the infinite reboot loop from #49201.
    ``strip_interrupted_tool_tails`` cannot help here because there is no
    tool result whose content could carry an interrupt marker.

    Only the very last message is examined. It is removed when it is a dict
    with role ``assistant`` and a non-empty ``tool_calls`` value; in that
    case a new list without the last element is returned. In every other
    case — including a tail that is a ``tool`` answer, so that genuine
    mid-progress tool loops still resume — the input list object is returned
    unchanged.
    """
    if not agent_history:
        return agent_history

    tail = agent_history[-1]
    if not isinstance(tail, dict) or tail.get("role") != "assistant":
        return agent_history

    pending_calls = tail.get("tool_calls")
    if not pending_calls:
        # Plain assistant reply (or empty tool_calls): nothing is dangling.
        return agent_history

    logger.debug(
        "Stripping dangling unanswered assistant(tool_calls) tail "
        "(%d call(s)) — process likely killed mid-tool-call by a "
        "restart/shutdown command (#49201)",
        len(pending_calls),
    )
    return agent_history[:-1]
|
||
|
||
|
||
def sanitize_replay_history(
    agent_history: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Apply both replay-tail strippers in the canonical order.

    Convenience entry point for resume code paths: removes interrupted
    assistant→tool blocks anywhere in the history, then removes a dangling
    unanswered ``assistant(tool_calls)`` tail from what is left.

    An empty (falsy) history is returned as-is. For non-empty input the
    result is always a new list — even when nothing was stripped — because
    ``strip_interrupted_tool_tails`` rebuilds the list it returns. The
    message dicts inside are shared with the input, not copied.
    """
    if not agent_history:
        return agent_history

    # The dangling-tail check runs on the already filtered list, so it sees
    # the tail as it will actually be replayed.
    without_interrupted = strip_interrupted_tool_tails(agent_history)
    return strip_dangling_tool_call_tail(without_interrupted)
|