mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-14 09:11:54 +00:00
* fix(gateway): auto-start after container restart via planned-stop marker
On Docker (s6-overlay), the gateway runs as a dynamically-registered s6
service. When the container stops/restarts/upgrades, s6 sends the gateway
a plain SIGTERM. The shutdown path (_stop_impl) ended with an
unconditional _update_runtime_status("stopped"), persisting
gateway_state=stopped to the volume. container_boot.py reads that on the
next boot and only auto-starts gateways whose last state was "running"
(_AUTOSTART_STATES) — so after a routine `docker compose up
--force-recreate` the gateway stays down and messaging channels silently
go dark, with no error surfaced (issue #42675).
The codebase already distinguishes intentional stops from unexpected
signals via the planned-stop marker (write_planned_stop_marker /
consume_planned_stop_marker_for_self): `hermes gateway stop`,
systemd/launchd ExecStop, and Ctrl+C write a marker before signalling,
so the handler classifies them as planned. An unmarked SIGTERM
(container/s6 restart, OOM, bare kill) is signal-initiated.
This wires that existing classification through to the state persist,
rather than adding unreliable signal-source inference:
- run.py: GatewayRunner._signal_initiated_shutdown, set in
shutdown_signal_handler's unmarked-signal branch. In _stop_impl, a
signal-initiated (non-restart) teardown now persists "running" instead
of "stopped" — preserving the operator's run-intent and overwriting the
mid-shutdown "draining" marker so _AUTOSTART_STATES matches on reboot.
Operator stops and restarts persist "stopped" as before.
- service_manager.py: S6ServiceManager.stop() now writes the planned-stop
marker for the supervised PID (read from s6-svstat) before `s6-svc -d`,
so an in-container `hermes gateway stop` is correctly classified as
intentional (parity with the systemd/launchd/host stop paths, which
already mark). Best-effort: a marker-write failure falls back to the
safe signal-initiated path.
Tests: shutdown persist-decision table (signal→running, operator→stopped,
restart→stopped), s6 stop marker write + svstat PID parse + failure
tolerance. The signal→running and s6-marker tests fail without the
respective source change. Verified end-to-end against a container built
from this branch: an unmarked SIGTERM to the live gateway leaves
gateway_state=running (shutdown-context log confirms signal path);
existing real container-restart suite still green.
* docs(docker): clarify gateway autostart distinguishes operator-stop from container-kill
The per-profile-supervision section described the autostart-across-restart
contract as "running gateways come back, stopped stay stopped" without
spelling out what records 'stopped'. That contract was the source of
#42675 confusion: users expected a restart to bring the gateway back and
it didn't. With the write-side fix, only an explicit `hermes gateway stop`
records 'stopped'; container/s6 restart SIGTERMs (incl. image upgrades and
unexpected exits) leave the state 'running' so the gateway auto-starts.
Make that distinction explicit in both the multi-profile and
per-profile-supervision sections.
* test(docker): real-restart autostart E2E for #42675
Adds test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp:
a live s6-supervised gateway is killed by an actual `docker restart`
SIGTERM (no manual gateway_state stamp, no planned-stop marker) and must
auto-start on the next boot. Exercises the WRITE side of the fix that the
existing stamp-based tests bypass.
Verified to FAIL against an origin/main image (reconciler logs
prior_state=stopped action=registered — the #42675 bug) and PASS against
the fixed image (prior_state=running action=started).
145 lines
5.4 KiB
Python
145 lines
5.4 KiB
Python
import asyncio
|
|
from collections import OrderedDict
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.base import BasePlatformAdapter, SendResult
|
|
from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
|
|
from gateway.run import GatewayRunner
|
|
from gateway.session import SessionSource
|
|
|
|
|
|
class RestartTestAdapter(BasePlatformAdapter):
    """In-memory adapter stub, registered as Telegram, that records sends.

    No network I/O is performed: ``send`` only appends to ``sent`` and
    ``sent_calls`` so a test can assert on what the gateway tried to deliver.
    """

    def __init__(self):
        stub_config = PlatformConfig(enabled=True, token="***")
        super().__init__(stub_config, Platform.TELEGRAM)
        # Message bodies only, in the order they were sent.
        self.sent: list[str] = []
        # One (chat_id, content, metadata) triple per send() call.
        self.sent_calls: list[tuple[str, str, object]] = []

    async def connect(self):
        """Report success without connecting to anything."""
        return True

    async def disconnect(self):
        """Do nothing; there is no connection to tear down."""
        return None

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Record the outbound message and return a successful ``SendResult``.

        ``reply_to`` is accepted but not recorded.
        """
        self.sent.append(content)
        self.sent_calls.append((chat_id, content, metadata))
        return SendResult(success=True, message_id="1")

    async def send_typing(self, chat_id, metadata=None):
        """Do nothing; typing indicators are not recorded."""
        return None

    async def get_chat_info(self, chat_id):
        """Return a minimal chat-info mapping containing only the id."""
        return {"id": chat_id}
|
|
|
|
|
|
def make_restart_source(
    chat_id: str = "123456",
    chat_type: str = "dm",
    thread_id: str | None = None,
) -> SessionSource:
    """Build a Telegram ``SessionSource`` for the fixed test user ``"u1"``.

    Args:
        chat_id: Chat identifier stored on the source.
        chat_type: Chat type stored on the source (defaults to ``"dm"``).
        thread_id: Optional thread identifier; ``None`` when unthreaded.

    Returns:
        A ``SessionSource`` whose platform is ``Platform.TELEGRAM``.
    """
    source = SessionSource(
        platform=Platform.TELEGRAM,
        user_id="u1",
        chat_id=chat_id,
        chat_type=chat_type,
        thread_id=thread_id,
    )
    return source
|
|
|
|
|
|
def make_restart_runner(
    adapter: BasePlatformAdapter | None = None,
) -> tuple[GatewayRunner, BasePlatformAdapter]:
    """Build a hand-wired ``GatewayRunner`` with a single Telegram adapter.

    The runner is allocated with ``object.__new__`` so
    ``GatewayRunner.__init__`` never runs; every attribute set here is
    assigned explicitly. Collaborators (hooks, stores, router, runtime-status
    writer) are mocks, while a fixed list of real ``GatewayRunner`` methods is
    bound onto the instance.

    Args:
        adapter: Adapter to register under ``Platform.TELEGRAM``. When falsy,
            a fresh ``RestartTestAdapter`` is created. Either way its message
            handler and busy-session handler are replaced by this function.

    Returns:
        ``(runner, platform_adapter)``.
    """
    # Bypass __init__ entirely; state is populated by hand below.
    runner = object.__new__(GatewayRunner)

    telegram_config = PlatformConfig(enabled=True, token="***")
    runner.config = GatewayConfig(platforms={Platform.TELEGRAM: telegram_config})

    # Lifecycle / shutdown state.
    runner._running = True
    runner._shutdown_event = asyncio.Event()
    runner._exit_reason = None
    runner._exit_code = None
    runner._stop_task = None
    runner._draining = False
    runner._signal_initiated_shutdown = False

    # Restart bookkeeping.
    runner._restart_requested = False
    runner._restart_task_started = False
    runner._restart_detached = False
    runner._restart_via_service = False
    runner._restart_command_source = None
    runner._restart_drain_timeout = DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT

    # Per-session containers, all starting empty.
    runner._running_agents = {}
    runner._running_agents_ts = {}
    runner._pending_messages = {}
    runner._pending_approvals = {}
    runner._pending_model_notes = {}
    runner._update_prompt_pending = {}
    runner._voice_mode = {}
    runner._session_model_overrides = {}
    runner._background_tasks = set()
    runner._session_sources = OrderedDict()
    runner._session_sources_max = 512
    runner._busy_input_mode = "interrupt"

    # Stubs for behaviour the tests do not exercise directly.
    runner._shutdown_all_gateway_honcho = lambda: None
    runner._update_runtime_status = MagicMock()
    runner._is_user_authorized = lambda _source: True

    # Real GatewayRunner methods, bound to this instance through the
    # descriptor protocol exactly as normal attribute lookup would do.
    real_method_names = (
        "_queue_or_replace_pending_event",
        "_session_key_for_source",
        "_handle_active_session_busy_message",
        "_handle_restart_command",
        "_handle_set_home_command",
        "_send_restart_notification",
        "_send_home_channel_startup_notifications",
        "_status_action_label",
        "_status_action_gerund",
        "_queue_during_drain_enabled",
        "_running_agent_count",
        "_snapshot_running_agents",
        "_notify_active_sessions_of_shutdown",
        "_cache_session_source",
        "_get_cached_session_source",
        "_launch_detached_restart_command",
        "request_restart",
    )
    for method_name in real_method_names:
        unbound = getattr(GatewayRunner, method_name)
        setattr(runner, method_name, unbound.__get__(runner, GatewayRunner))

    # Mocked collaborators.
    runner.hooks = MagicMock()
    runner.hooks.emit = AsyncMock()
    runner.pairing_store = MagicMock()
    runner.session_store = MagicMock()
    runner.session_store._entries = {}
    runner.delivery_router = MagicMock()

    # Wire the adapter: inbound messages go to a no-op mock, busy-session
    # messages go to the runner's real handler.
    platform_adapter = adapter or RestartTestAdapter()
    platform_adapter.set_message_handler(AsyncMock(return_value=None))
    platform_adapter.set_busy_session_handler(
        runner._handle_active_session_busy_message
    )
    runner.adapters = {Platform.TELEGRAM: platform_adapter}

    return runner, platform_adapter
|