From dccf1fb6e0eacca33a3c46f44bccde35f9fa2880 Mon Sep 17 00:00:00 2001 From: LeonSGP43 Date: Tue, 5 May 2026 10:18:06 +0800 Subject: [PATCH] fix(gateway): cap adapter disconnect during stop --- gateway/run.py | 28 ++++++++++++++++++- hermes_cli/gateway.py | 10 ++++++- tests/gateway/test_safe_adapter_disconnect.py | 20 +++++++++++++ tests/hermes_cli/test_gateway_service.py | 23 +++++++++++++++ 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 13d57c46d0..457dc60d75 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -61,6 +61,7 @@ from hermes_cli.config import cfg_get _AGENT_CACHE_MAX_SIZE = 128 _AGENT_CACHE_IDLE_TTL_SECS = 3600.0 # evict agents idle for >1h _PLATFORM_CONNECT_TIMEOUT_SECS_DEFAULT = 30.0 +_ADAPTER_DISCONNECT_TIMEOUT_SECS_DEFAULT = 5.0 _TELEGRAM_COMMAND_MENTION_RE = re.compile(r"(? float: + """Return the per-adapter disconnect timeout used during shutdown.""" + raw = os.getenv("HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT", "").strip() + if raw: + try: + timeout = float(raw) + except ValueError: + logger.warning( + "Ignoring invalid HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT=%r", + raw, + ) + else: + return max(0.0, timeout) + return _ADAPTER_DISCONNECT_TIMEOUT_SECS_DEFAULT + def _platform_connect_timeout_secs(self) -> float: """Return the per-platform connect timeout used during startup/retry.""" raw = os.getenv("HERMES_GATEWAY_PLATFORM_CONNECT_TIMEOUT", "").strip() diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index ecd7fa2e0c..adee8cd44b 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -2387,7 +2387,15 @@ def systemd_stop(system: bool = False): write_planned_stop_marker(pid) except Exception: pass - _run_systemctl(["stop", get_service_name()], system=system, check=True, timeout=90) + try: + _run_systemctl(["stop", get_service_name()], system=system, check=True, timeout=90) + except subprocess.TimeoutExpired: + label = _service_scope_label(system) + print( + f"Gateway {label} service is still stopping after 90s; " + "check `hermes gateway status` or logs for final shutdown state." + ) + return print(f"✓ {_service_scope_label(system).capitalize()} service stopped") diff --git a/tests/gateway/test_safe_adapter_disconnect.py b/tests/gateway/test_safe_adapter_disconnect.py index ec11f2663a..9a17aa0476 100644 --- a/tests/gateway/test_safe_adapter_disconnect.py +++ b/tests/gateway/test_safe_adapter_disconnect.py @@ -10,6 +10,8 @@ The fix: gateway/run.py wraps each adapter connect() with a safety-net call to _safe_adapter_disconnect() in the failure branches. """ +import asyncio +import logging from unittest.mock import AsyncMock, MagicMock import pytest @@ -57,3 +59,21 @@ async def test_safe_disconnect_handles_none_platform(bare_runner): await bare_runner._safe_adapter_disconnect(adapter, None) adapter.disconnect.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_safe_disconnect_times_out_and_continues(bare_runner, monkeypatch, caplog): + """A wedged adapter disconnect must not block gateway shutdown.""" + monkeypatch.setenv("HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT", "0.001") + adapter = MagicMock() + + async def hang(): + await asyncio.sleep(60) + + adapter.disconnect = AsyncMock(side_effect=hang) + + with caplog.at_level(logging.WARNING, logger="gateway.run"): + await bare_runner._safe_adapter_disconnect(adapter, Platform.FEISHU) + + adapter.disconnect.assert_awaited_once() + assert "Timed out after 0.0s while disconnecting feishu adapter" in caplog.text diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index 15968f798e..3b68476fbd 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -140,6 +140,29 @@ class TestSystemdServiceRefresh: assert markers == [321] assert calls == [["stop", gateway_cli.get_service_name()]] + def test_systemd_stop_timeout_prints_status_guidance(self, monkeypatch, capsys): + markers = [] + + monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) + monkeypatch.setattr(gateway_cli, "_require_service_installed", lambda action, system=False: None) + monkeypatch.setattr(status, "get_running_pid", lambda cleanup_stale=True: 321) + monkeypatch.setattr( + status, + "write_planned_stop_marker", + lambda pid: markers.append(pid) or True, + ) + + def fake_run_systemctl(args, **kwargs): + raise subprocess.TimeoutExpired(args, kwargs.get("timeout")) + + monkeypatch.setattr(gateway_cli, "_run_systemctl", fake_run_systemctl) + + gateway_cli.systemd_stop() + + assert markers == [321] + output = capsys.readouterr().out + assert "still stopping after 90s" in output + assert "hermes gateway status" in output def test_run_gateway_refreshes_outdated_unit_on_boot(self, tmp_path, monkeypatch): """run_gateway() should refresh the systemd unit on boot so that