mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-11 03:31:55 +00:00
feat(gateway): auto-resume interrupted sessions after restart
This commit is contained in:
parent
233bfd3621
commit
fad684b1f3
3 changed files with 190 additions and 1 deletions
|
|
@ -2739,6 +2739,57 @@ class GatewayRunner:
|
|||
task.add_done_callback(self._background_tasks.discard)
|
||||
return True
|
||||
|
||||
def _schedule_resume_pending_sessions(self) -> int:
    """Schedule a synthetic continuation turn for restart-interrupted sessions.

    ``resume_pending`` already preserves the transcript and injects the
    recovery system note on the next user message. Instead of waiting for
    the user to send a placeholder ping after a restart, this method
    fabricates that next message itself and hands it to the adapter that
    owns the session's origin platform.

    Only sessions reported by ``session_store.list_resume_pending`` with a
    ``restart_timeout`` or ``shutdown_timeout`` reason are considered, and
    sessions whose origin or platform adapter is unavailable are skipped.

    Returns:
        The number of sessions for which a background resume task was
        created; ``0`` when listing the pending sessions fails.
    """
    try:
        # The freshness window is evaluated inside the ``try`` so that a
        # failure there is reported the same way as a listing failure.
        resumable = self.session_store.list_resume_pending(
            window_secs=_auto_continue_freshness_window(),
            allowed_reasons={"restart_timeout", "shutdown_timeout"},
        )
    except Exception as exc:
        logger.warning("Failed to list resume-pending sessions: %s", exc)
        return 0

    # The same note is sent to every resumed session, so build it once.
    resume_note = (
        "[System note: The gateway restarted after interrupting "
        "this session. Resume the previous turn now. Reconcile "
        "the transcript first: if tool results are already present, "
        "process them before taking new action; never claim work "
        "completed unless it is visible in the transcript/tool output.]"
    )

    count = 0
    for session in resumable:
        origin = getattr(session, "origin", None)
        platform = getattr(origin, "platform", None)

        # Only consult the adapter registry when a platform is known.
        adapter = None
        if platform is not None:
            adapter = self.adapters.get(platform)

        if origin is None or adapter is None:
            logger.debug(
                "Skipping auto-resume for %s: adapter unavailable for %s",
                getattr(session, "session_key", "?"),
                getattr(platform, "value", platform),
            )
            continue

        resume_event = MessageEvent(
            text=resume_note,
            message_type=MessageType.TEXT,
            source=origin,
            internal=True,
        )

        # Track the task in ``_background_tasks`` until it finishes; the
        # done-callback removes it again.
        resume_task = asyncio.create_task(adapter.handle_message(resume_event))
        self._background_tasks.add(resume_task)
        resume_task.add_done_callback(self._background_tasks.discard)
        count += 1

    if count > 0:
        logger.info("Scheduled auto-resume for %d restart-interrupted session(s)", count)
    return count
|
||||
|
||||
async def start(self) -> bool:
|
||||
"""
|
||||
Start the gateway and all configured platform adapters.
|
||||
|
|
@ -3127,6 +3178,12 @@ class GatewayRunner:
|
|||
skip_targets=skip_home_targets,
|
||||
)
|
||||
|
||||
# Automatically continue fresh sessions that were interrupted by the
|
||||
# previous gateway restart/shutdown. The resume_pending flag is cleared
|
||||
# by the normal successful-turn path, so a failed auto-resume remains
|
||||
# visible for manual recovery on the next user message.
|
||||
self._schedule_resume_pending_sessions()
|
||||
|
||||
# Drain any recovered process watchers (from crash recovery checkpoint)
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
|
|
|
|||
|
|
@ -1028,6 +1028,42 @@ class SessionStore:
|
|||
self._save()
|
||||
return True
|
||||
|
||||
def list_resume_pending(
    self,
    *,
    window_secs: Optional[float] = None,
    now: Optional[float] = None,
    allowed_reasons: Optional[set[str]] = None,
) -> List["SessionEntry"]:
    """Return fresh restart-interrupted sessions eligible for resume.

    Only entries that still have an origin are returned; the gateway needs
    that origin to route continuation back through the original
    platform/chat/thread. ``suspended`` entries are excluded because
    explicit suspension/stuck-loop escalation must win over resume.

    Args:
        window_secs: Maximum age, in seconds, of an entry's resume marker
            (``last_resume_marked_at``, falling back to ``updated_at``).
            ``None`` or a non-positive value disables the freshness check.
            Entries with no timestamp at all are never filtered out by
            this check.
        now: POSIX timestamp to use as the current time. When omitted,
            the module's ``_now()`` helper supplies it.
        allowed_reasons: When given, only entries whose ``resume_reason``
            is in this set are returned.

    Returns:
        The matching entries ordered by resume marker, oldest first.
        Entries without any timestamp are placed before dated ones and
        keep their relative order.
    """
    current = datetime.fromtimestamp(now) if now is not None else _now()
    window = float(window_secs) if window_secs is not None else None

    def _marker(entry):
        # Prefer the explicit resume mark; fall back to the last update.
        return entry.last_resume_marked_at or entry.updated_at

    def _sort_key(entry):
        # Undated entries pass the freshness filter below, so the sort key
        # must not compare ``None`` against a datetime (TypeError). The
        # leading flag orders undated entries first without ever comparing
        # the marker values of mixed kinds.
        marker = _marker(entry)
        return (False,) if marker is None else (True, marker)

    # Snapshot under the lock; filtering happens outside it.
    with self._lock:
        self._ensure_loaded_locked()
        entries = list(self._entries.values())

    pending: List[SessionEntry] = []
    for entry in entries:
        if not entry.resume_pending or entry.suspended or entry.origin is None:
            continue
        if allowed_reasons is not None and entry.resume_reason not in allowed_reasons:
            continue
        if window is not None and window > 0:
            marker = _marker(entry)
            if marker is not None and (current - marker).total_seconds() > window:
                continue
        pending.append(entry)

    pending.sort(key=_sort_key)
    return pending
|
||||
|
||||
def prune_old_entries(self, max_age_days: int) -> int:
|
||||
"""Drop SessionEntry records older than max_age_days.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue