mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-13 03:52:00 +00:00
feat(gateway): auto-resume interrupted sessions after restart
This commit is contained in:
parent
233bfd3621
commit
fad684b1f3
3 changed files with 190 additions and 1 deletions
|
|
@ -2739,6 +2739,57 @@ class GatewayRunner:
|
||||||
task.add_done_callback(self._background_tasks.discard)
|
task.add_done_callback(self._background_tasks.discard)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _schedule_resume_pending_sessions(self) -> int:
|
||||||
|
"""Auto-continue fresh restart-interrupted sessions after startup.
|
||||||
|
|
||||||
|
``resume_pending`` already preserves the transcript and injects the
|
||||||
|
recovery system note on the next user message. This method closes the
|
||||||
|
restart UX gap by synthesizing that next message once adapters are back
|
||||||
|
online, so users do not have to send a placeholder ping after restart.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
entries = self.session_store.list_resume_pending(
|
||||||
|
window_secs=_auto_continue_freshness_window(),
|
||||||
|
allowed_reasons={"restart_timeout", "shutdown_timeout"},
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to list resume-pending sessions: %s", exc)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
scheduled = 0
|
||||||
|
for entry in entries:
|
||||||
|
source = getattr(entry, "origin", None)
|
||||||
|
platform = getattr(source, "platform", None)
|
||||||
|
adapter = self.adapters.get(platform) if platform is not None else None
|
||||||
|
if source is None or adapter is None:
|
||||||
|
logger.debug(
|
||||||
|
"Skipping auto-resume for %s: adapter unavailable for %s",
|
||||||
|
getattr(entry, "session_key", "?"),
|
||||||
|
getattr(platform, "value", platform),
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
event = MessageEvent(
|
||||||
|
text=(
|
||||||
|
"[System note: The gateway restarted after interrupting "
|
||||||
|
"this session. Resume the previous turn now. Reconcile "
|
||||||
|
"the transcript first: if tool results are already present, "
|
||||||
|
"process them before taking new action; never claim work "
|
||||||
|
"completed unless it is visible in the transcript/tool output.]"
|
||||||
|
),
|
||||||
|
message_type=MessageType.TEXT,
|
||||||
|
source=source,
|
||||||
|
internal=True,
|
||||||
|
)
|
||||||
|
task = asyncio.create_task(adapter.handle_message(event))
|
||||||
|
self._background_tasks.add(task)
|
||||||
|
task.add_done_callback(self._background_tasks.discard)
|
||||||
|
scheduled += 1
|
||||||
|
|
||||||
|
if scheduled:
|
||||||
|
logger.info("Scheduled auto-resume for %d restart-interrupted session(s)", scheduled)
|
||||||
|
return scheduled
|
||||||
|
|
||||||
async def start(self) -> bool:
|
async def start(self) -> bool:
|
||||||
"""
|
"""
|
||||||
Start the gateway and all configured platform adapters.
|
Start the gateway and all configured platform adapters.
|
||||||
|
|
@ -3127,6 +3178,12 @@ class GatewayRunner:
|
||||||
skip_targets=skip_home_targets,
|
skip_targets=skip_home_targets,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Automatically continue fresh sessions that were interrupted by the
|
||||||
|
# previous gateway restart/shutdown. The resume_pending flag is cleared
|
||||||
|
# by the normal successful-turn path, so a failed auto-resume remains
|
||||||
|
# visible for manual recovery on the next user message.
|
||||||
|
self._schedule_resume_pending_sessions()
|
||||||
|
|
||||||
# Drain any recovered process watchers (from crash recovery checkpoint)
|
# Drain any recovered process watchers (from crash recovery checkpoint)
|
||||||
try:
|
try:
|
||||||
from tools.process_registry import process_registry
|
from tools.process_registry import process_registry
|
||||||
|
|
|
||||||
|
|
@ -1028,6 +1028,42 @@ class SessionStore:
|
||||||
self._save()
|
self._save()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def list_resume_pending(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
window_secs: Optional[float] = None,
|
||||||
|
now: Optional[float] = None,
|
||||||
|
allowed_reasons: Optional[set[str]] = None,
|
||||||
|
) -> List[SessionEntry]:
|
||||||
|
"""Return fresh restart-interrupted sessions eligible for resume.
|
||||||
|
|
||||||
|
Only entries that still have an origin are returned; the gateway needs
|
||||||
|
that origin to route continuation back through the original
|
||||||
|
platform/chat/thread. ``suspended`` entries are excluded because
|
||||||
|
explicit suspension/stuck-loop escalation must win over resume.
|
||||||
|
"""
|
||||||
|
current = datetime.fromtimestamp(now) if now is not None else _now()
|
||||||
|
window = float(window_secs) if window_secs is not None else None
|
||||||
|
|
||||||
|
with self._lock:
|
||||||
|
self._ensure_loaded_locked()
|
||||||
|
entries = list(self._entries.values())
|
||||||
|
|
||||||
|
pending: List[SessionEntry] = []
|
||||||
|
for entry in entries:
|
||||||
|
if not entry.resume_pending or entry.suspended or entry.origin is None:
|
||||||
|
continue
|
||||||
|
if allowed_reasons is not None and entry.resume_reason not in allowed_reasons:
|
||||||
|
continue
|
||||||
|
if window is not None and window > 0:
|
||||||
|
marker = entry.last_resume_marked_at or entry.updated_at
|
||||||
|
if marker is not None and (current - marker).total_seconds() > window:
|
||||||
|
continue
|
||||||
|
pending.append(entry)
|
||||||
|
|
||||||
|
pending.sort(key=lambda entry: entry.last_resume_marked_at or entry.updated_at)
|
||||||
|
return pending
|
||||||
|
|
||||||
def prune_old_entries(self, max_age_days: int) -> int:
|
def prune_old_entries(self, max_age_days: int) -> int:
|
||||||
"""Drop SessionEntry records older than max_age_days.
|
"""Drop SessionEntry records older than max_age_days.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from gateway.config import GatewayConfig, HomeChannel, Platform, PlatformConfig
|
from gateway.config import GatewayConfig, HomeChannel, Platform, PlatformConfig
|
||||||
from gateway.platforms.base import SendResult
|
from gateway.platforms.base import MessageEvent, MessageType, SendResult
|
||||||
from gateway.run import (
|
from gateway.run import (
|
||||||
_auto_continue_freshness_window,
|
_auto_continue_freshness_window,
|
||||||
_coerce_gateway_timestamp,
|
_coerce_gateway_timestamp,
|
||||||
|
|
@ -227,6 +227,30 @@ class TestSessionEntryResumeFields:
|
||||||
|
|
||||||
|
|
||||||
class TestMarkResumePending:
|
class TestMarkResumePending:
|
||||||
|
def test_list_resume_pending_returns_fresh_entries_with_origins(self, tmp_path):
|
||||||
|
store = _make_store(tmp_path)
|
||||||
|
fresh = store.get_or_create_session(_make_source(chat_id="fresh"))
|
||||||
|
stale = store.get_or_create_session(_make_source(chat_id="stale"))
|
||||||
|
missing_origin = store.get_or_create_session(_make_source(chat_id="missing-origin"))
|
||||||
|
suspended = store.get_or_create_session(_make_source(chat_id="suspended"))
|
||||||
|
|
||||||
|
store.mark_resume_pending(fresh.session_key, reason="restart_timeout")
|
||||||
|
store.mark_resume_pending(stale.session_key, reason="restart_timeout")
|
||||||
|
store.mark_resume_pending(missing_origin.session_key, reason="restart_timeout")
|
||||||
|
store.mark_resume_pending(suspended.session_key, reason="restart_timeout")
|
||||||
|
old = datetime.now() - timedelta(hours=3)
|
||||||
|
store._entries[stale.session_key].last_resume_marked_at = old
|
||||||
|
store._entries[missing_origin.session_key].origin = None
|
||||||
|
store._entries[suspended.session_key].suspended = True
|
||||||
|
|
||||||
|
pending = store.list_resume_pending(
|
||||||
|
window_secs=3600,
|
||||||
|
now=datetime.now().timestamp(),
|
||||||
|
allowed_reasons={"restart_timeout"},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert [entry.session_key for entry in pending] == [fresh.session_key]
|
||||||
|
|
||||||
def test_marks_existing_session(self, tmp_path):
|
def test_marks_existing_session(self, tmp_path):
|
||||||
store = _make_store(tmp_path)
|
store = _make_store(tmp_path)
|
||||||
source = _make_source()
|
source = _make_source()
|
||||||
|
|
@ -910,6 +934,78 @@ async def test_drain_timeout_skips_pending_sentinel_sessions():
|
||||||
assert marked == {session_key_real}
|
assert marked == {session_key_real}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Gateway startup auto-resume
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_startup_auto_resume_schedules_fresh_pending_sessions():
|
||||||
|
"""Fresh resume_pending sessions should continue automatically after startup.
|
||||||
|
|
||||||
|
This closes the UX gap where restart recovery only happened if the user sent
|
||||||
|
another message after the gateway came back.
|
||||||
|
"""
|
||||||
|
runner, adapter = make_restart_runner()
|
||||||
|
source = make_restart_source(chat_id="resume-chat", thread_id="topic-1")
|
||||||
|
pending_entry = SessionEntry(
|
||||||
|
session_key="agent:main:telegram:group:resume-chat:topic-1",
|
||||||
|
session_id="sid",
|
||||||
|
created_at=datetime.now(),
|
||||||
|
updated_at=datetime.now(),
|
||||||
|
origin=source,
|
||||||
|
platform=Platform.TELEGRAM,
|
||||||
|
chat_type="group",
|
||||||
|
resume_pending=True,
|
||||||
|
resume_reason="restart_timeout",
|
||||||
|
last_resume_marked_at=datetime.now(),
|
||||||
|
)
|
||||||
|
runner.session_store.list_resume_pending = MagicMock(return_value=[pending_entry])
|
||||||
|
adapter.handle_message = AsyncMock()
|
||||||
|
|
||||||
|
scheduled = runner._schedule_resume_pending_sessions()
|
||||||
|
await asyncio.sleep(0)
|
||||||
|
|
||||||
|
assert scheduled == 1
|
||||||
|
runner.session_store.list_resume_pending.assert_called_once_with(
|
||||||
|
window_secs=_auto_continue_freshness_window(),
|
||||||
|
allowed_reasons={"restart_timeout", "shutdown_timeout"},
|
||||||
|
)
|
||||||
|
adapter.handle_message.assert_awaited_once()
|
||||||
|
event = adapter.handle_message.await_args.args[0]
|
||||||
|
assert isinstance(event, MessageEvent)
|
||||||
|
assert event.internal is True
|
||||||
|
assert event.message_type == MessageType.TEXT
|
||||||
|
assert event.source == source
|
||||||
|
assert event.text.startswith("[System note: The gateway restarted")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_startup_auto_resume_skips_when_adapter_unavailable():
|
||||||
|
runner, adapter = make_restart_runner()
|
||||||
|
source = make_restart_source(chat_id="resume-chat")
|
||||||
|
pending_entry = SessionEntry(
|
||||||
|
session_key="agent:main:telegram:dm:resume-chat",
|
||||||
|
session_id="sid",
|
||||||
|
created_at=datetime.now(),
|
||||||
|
updated_at=datetime.now(),
|
||||||
|
origin=source,
|
||||||
|
platform=Platform.TELEGRAM,
|
||||||
|
chat_type="dm",
|
||||||
|
resume_pending=True,
|
||||||
|
resume_reason="restart_timeout",
|
||||||
|
last_resume_marked_at=datetime.now(),
|
||||||
|
)
|
||||||
|
runner.session_store.list_resume_pending = MagicMock(return_value=[pending_entry])
|
||||||
|
runner.adapters = {}
|
||||||
|
adapter.handle_message = AsyncMock()
|
||||||
|
|
||||||
|
scheduled = runner._schedule_resume_pending_sessions()
|
||||||
|
|
||||||
|
assert scheduled == 0
|
||||||
|
adapter.handle_message.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Shutdown banner wording
|
# Shutdown banner wording
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue