mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-11 08:42:11 +00:00
fix: dedupe concurrent gateway restarts + surface restart outcome in onboarding UI
Follow-ups to the salvaged Telegram QR onboarding auto-restart: - _spawn_gateway_restart() reuses a live in-flight 'hermes gateway restart' child instead of spawning a second racing one (stale cached frontend + new backend both requesting a restart, or restart-button double-click). Both /api/gateway/restart and the onboarding apply path go through it. - ChannelsPage polls /api/actions/gateway-restart/status after a server-initiated restart and surfaces a non-zero exit (e.g. systemd linger missing) via the manual-restart banner, since restart_started only means the child spawned. - Test for the reuse path + _ACTION_PROCS isolation in existing tests.
This commit is contained in:
parent
984e69ff62
commit
fa32af886f
3 changed files with 112 additions and 2 deletions
|
|
@ -1372,11 +1372,28 @@ def _tail_lines(path: Path, n: int) -> List[str]:
|
|||
return lines[-n:] if n > 0 else lines
|
||||
|
||||
|
||||
def _spawn_gateway_restart() -> Tuple[subprocess.Popen, bool]:
|
||||
"""Spawn ``hermes gateway restart``, reusing an in-flight restart.
|
||||
|
||||
Multiple dashboard paths can request a restart in quick succession
|
||||
(restart button double-click, or a stale cached frontend firing its own
|
||||
restart after the server already auto-restarted post-onboarding). Two
|
||||
concurrent ``hermes gateway restart`` children race each other on the
|
||||
manual kill-and-start path, so reuse the live one instead.
|
||||
|
||||
Returns ``(proc, reused)``.
|
||||
"""
|
||||
existing = _ACTION_PROCS.get("gateway-restart")
|
||||
if existing is not None and existing.poll() is None:
|
||||
return existing, True
|
||||
return _spawn_hermes_action(["gateway", "restart"], "gateway-restart"), False
|
||||
|
||||
|
||||
@app.post("/api/gateway/restart")
|
||||
async def restart_gateway():
|
||||
"""Kick off a ``hermes gateway restart`` in the background."""
|
||||
try:
|
||||
proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart")
|
||||
proc, _reused = _spawn_gateway_restart()
|
||||
except Exception as exc:
|
||||
_log.exception("Failed to spawn gateway restart")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to restart gateway: {exc}")
|
||||
|
|
@ -3757,13 +3774,18 @@ def _restart_gateway_after_telegram_onboarding() -> dict[str, Any]:
|
|||
restart failures so the UI can fall back to the existing manual banner.
|
||||
"""
|
||||
try:
|
||||
proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart")
|
||||
proc, reused = _spawn_gateway_restart()
|
||||
except Exception as exc:
|
||||
_log.exception("Failed to auto-restart gateway after Telegram onboarding")
|
||||
return {
|
||||
"restart_started": False,
|
||||
"restart_error": str(exc),
|
||||
}
|
||||
if reused:
|
||||
_log.info(
|
||||
"Telegram onboarding: reusing in-flight gateway restart (pid %s)",
|
||||
proc.pid,
|
||||
)
|
||||
return {
|
||||
"restart_started": True,
|
||||
"restart_action": "gateway-restart",
|
||||
|
|
|
|||
|
|
@ -1399,6 +1399,7 @@ class TestWebServerEndpoints:
|
|||
}
|
||||
|
||||
monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request)
|
||||
ws._ACTION_PROCS.pop("gateway-restart", None)
|
||||
restart_calls = []
|
||||
|
||||
class FakeRestartProc:
|
||||
|
|
@ -1471,6 +1472,7 @@ class TestWebServerEndpoints:
|
|||
}
|
||||
|
||||
monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request)
|
||||
ws._ACTION_PROCS.pop("gateway-restart", None)
|
||||
|
||||
def fail_spawn_action(subcommand, name):
|
||||
assert subcommand == ["gateway", "restart"]
|
||||
|
|
@ -1502,6 +1504,65 @@ class TestWebServerEndpoints:
|
|||
assert env["TELEGRAM_ALLOWED_USERS"] == "123456789"
|
||||
assert load_config()["platforms"]["telegram"]["enabled"] is True
|
||||
|
||||
def test_telegram_onboarding_apply_reuses_inflight_gateway_restart(
|
||||
self, monkeypatch
|
||||
):
|
||||
"""A live in-flight gateway restart is reused instead of spawning a
|
||||
second racing ``hermes gateway restart`` child (e.g. when a stale
|
||||
cached frontend also fires its own restart call)."""
|
||||
import hermes_cli.web_server as ws
|
||||
|
||||
with ws._telegram_onboarding_lock:
|
||||
ws._telegram_onboarding_pairings.clear()
|
||||
|
||||
def fake_request(method, path, *, body=None, bearer_token=None):
|
||||
if method == "POST":
|
||||
return {
|
||||
"pairing_id": "pair-reuse",
|
||||
"poll_token": "poll-secret",
|
||||
"suggested_username": "hermes_pair_reuse_bot",
|
||||
"deep_link": "https://t.me/newbot/HermesSetupBot/hermes_pair_reuse_bot",
|
||||
"qr_payload": "https://t.me/newbot/HermesSetupBot/hermes_pair_reuse_bot",
|
||||
"expires_at": "2027-05-18T00:00:00.000Z",
|
||||
}
|
||||
return {
|
||||
"status": "ready",
|
||||
"bot_username": "hermes_pair_reuse_bot",
|
||||
"owner_user_id": 123456789,
|
||||
"token": "123456:SECRET",
|
||||
}
|
||||
|
||||
monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request)
|
||||
|
||||
class FakeRunningProc:
|
||||
pid = 5151
|
||||
|
||||
def poll(self):
|
||||
return None # still running
|
||||
|
||||
monkeypatch.setitem(ws._ACTION_PROCS, "gateway-restart", FakeRunningProc())
|
||||
|
||||
def fail_spawn_action(subcommand, name):
|
||||
raise AssertionError("must not spawn a second concurrent restart")
|
||||
|
||||
monkeypatch.setattr(ws, "_spawn_hermes_action", fail_spawn_action)
|
||||
|
||||
start = self.client.post("/api/messaging/telegram/onboarding/start", json={})
|
||||
assert start.status_code == 200
|
||||
ready = self.client.get("/api/messaging/telegram/onboarding/pair-reuse")
|
||||
assert ready.status_code == 200
|
||||
|
||||
applied = self.client.post(
|
||||
"/api/messaging/telegram/onboarding/pair-reuse/apply",
|
||||
json={"allowed_user_ids": ["123456789"]},
|
||||
)
|
||||
|
||||
assert applied.status_code == 200
|
||||
applied_data = applied.json()
|
||||
assert applied_data["needs_restart"] is False
|
||||
assert applied_data["restart_started"] is True
|
||||
assert applied_data["restart_pid"] == 5151
|
||||
|
||||
def test_telegram_onboarding_apply_requires_ready_pairing(self, monkeypatch):
|
||||
import hermes_cli.web_server as ws
|
||||
|
||||
|
|
|
|||
|
|
@ -599,6 +599,32 @@ function TelegramOnboardingPanel({
|
|||
setNewAllowedId("");
|
||||
};
|
||||
|
||||
// restart_started only means the `hermes gateway restart` child spawned —
|
||||
// not that the restart will succeed (e.g. systemd linger missing, service
|
||||
// manager failure). Poll the action status briefly and surface a non-zero
|
||||
// exit via the manual-restart banner. Note: in no-service installs the
|
||||
// child becomes the foreground gateway and never exits, so "still running
|
||||
// when the window closes" counts as success.
|
||||
const watchRestartOutcome = async () => {
|
||||
for (let i = 0; i < 20; i++) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1500));
|
||||
try {
|
||||
const st = await api.getActionStatus("gateway-restart", 5);
|
||||
if (st.running) continue;
|
||||
if (st.exit_code !== 0 && st.exit_code !== null) {
|
||||
onRestartNeeded();
|
||||
showToast(
|
||||
`Gateway restart failed (exit ${st.exit_code}) — restart manually`,
|
||||
"error",
|
||||
);
|
||||
}
|
||||
return;
|
||||
} catch {
|
||||
// transient fetch error; keep polling
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const apply = async () => {
|
||||
if (!setup) return;
|
||||
if (allowedIds.length === 0) {
|
||||
|
|
@ -616,6 +642,7 @@ function TelegramOnboardingPanel({
|
|||
showToast("Telegram saved; gateway restarting…", "success");
|
||||
setRestartNeeded(false);
|
||||
setTimeout(() => void onChanged(), 4000);
|
||||
void watchRestartOutcome();
|
||||
} else if (result.restart_started === undefined && result.needs_restart) {
|
||||
try {
|
||||
await api.restartGateway();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue