mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(update): reset-failed before every fallback restart so the gateway can't get stranded (#21371)
cmd_update's auto-restart path could leave the gateway dead after a transient failure in systemd's own auto-restart window. Reproduced on Ubuntu 25.10 + systemd 257: after update, gateway drains and exits 75, systemd's first respawn 60s later fails (status=200/CHDIR with "No such file or directory" on a WorkingDirectory that demonstrably exists), the unit ends up in RestartMaxDelaySec=300 backoff, and cmd_update's fallback 'systemctl restart' never recovers it — leaving users with a permanently silent gateway until they manually run 'systemctl reset-failed'. The fix mirrors the recovery pattern 'hermes gateway restart' (systemd_restart) got in PR #20949: always reset-failed before restart, on both the initial fallback and the retry. Also rewrites the final failure message to tell the user to reset-failed + restart (not just restart, which is the step that already failed twice).
This commit is contained in:
parent
04918345ea
commit
1d2029b2b7
2 changed files with 261 additions and 3 deletions
|
|
@ -7735,6 +7735,23 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
# when the graceful path failed (unit missing
|
||||
# SIGUSR1 wiring, drain exceeded the budget,
|
||||
# restart-policy mismatch).
|
||||
#
|
||||
# Always `reset-failed` first. If systemd's own
|
||||
# auto-restart attempts already parked the unit
|
||||
# in a failed state (transient CHDIR / OOM /
|
||||
# filesystem race after our drain + exit-75),
|
||||
# a plain `systemctl restart` can wedge against
|
||||
# the RestartSec backoff and leave the unit
|
||||
# dead. Clearing the failed state first makes
|
||||
# the restart idempotent. Mirrors the recovery
|
||||
# path in `hermes gateway restart`
|
||||
# (`systemd_restart()`) as of PR #20949.
|
||||
subprocess.run(
|
||||
scope_cmd + ["reset-failed", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
restart = subprocess.run(
|
||||
scope_cmd + ["restart", svc_name],
|
||||
capture_output=True,
|
||||
|
|
@ -7754,10 +7771,19 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
else:
|
||||
# Retry once — transient startup failures
|
||||
# (stale module cache, import race) often
|
||||
# resolve on the second attempt.
|
||||
# resolve on the second attempt. Again
|
||||
# clear any failed state first so the
|
||||
# retry isn't blocked by the previous
|
||||
# crash.
|
||||
print(
|
||||
f" ⚠ {svc_name} died after restart, retrying..."
|
||||
)
|
||||
subprocess.run(
|
||||
scope_cmd + ["reset-failed", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
subprocess.run(
|
||||
scope_cmd + ["restart", svc_name],
|
||||
capture_output=True,
|
||||
|
|
@ -7772,10 +7798,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
restarted_services.append(svc_name)
|
||||
print(f" ✓ {svc_name} recovered on retry")
|
||||
else:
|
||||
_scope_flag = "--user " if scope == "user" else ""
|
||||
print(
|
||||
f" ✗ {svc_name} failed to stay running after restart.\n"
|
||||
f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n"
|
||||
f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}"
|
||||
f" Check logs: journalctl {_scope_flag}-u {svc_name} --since '2 min ago'\n"
|
||||
f" Recover manually:\n"
|
||||
f" systemctl {_scope_flag}reset-failed {svc_name}\n"
|
||||
f" systemctl {_scope_flag}restart {svc_name}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue