mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-05 02:31:47 +00:00
fix: gateway systemd unit now retries indefinitely with backoff (#18639)
The old defaults (StartLimitIntervalSec=600, StartLimitBurst=5, RestartSec=30) meant any network outage over ~5 minutes would permanently kill the gateway until manual intervention.

Changes:
- StartLimitIntervalSec=0 (never give up)
- Restart=always (not just on-failure)
- RestartSec=60 with RestartMaxDelaySec=300, RestartSteps=5 (exponential backoff: the restart delay grows geometrically from 60s to the 300s cap over 5 steps, roughly 60 → 83 → 114 → 158 → 218 → 300s)
- After=network-online.target + Wants=network-online.target (both units now wait for actual connectivity, not just network.target)

Power outage → internet down → internet back = auto-recovery.
This commit is contained in:
parent
585d6778da
commit
f98b5d00a4
1 changed file with 14 additions and 11 deletions
|
|
@ -188,7 +188,7 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool:
|
||||||
|
|
||||||
SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)``
|
SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)``
|
||||||
which drains in-flight agent runs (up to ``agent.restart_drain_timeout``
|
which drains in-flight agent runs (up to ``agent.restart_drain_timeout``
|
||||||
seconds), then exits with code 75. Both systemd (``Restart=on-failure``
|
seconds), then exits with code 75. Both systemd (``Restart=always``
|
||||||
+ ``RestartForceExitStatus=75``) and launchd (``KeepAlive.SuccessfulExit
|
+ ``RestartForceExitStatus=75``) and launchd (``KeepAlive.SuccessfulExit
|
||||||
= false``) relaunch the process after the graceful exit.
|
= false``) relaunch the process after the graceful exit.
|
||||||
|
|
||||||
|
|
@ -1655,8 +1655,7 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
|
||||||
Description={SERVICE_DESCRIPTION}
|
Description={SERVICE_DESCRIPTION}
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
StartLimitIntervalSec=600
|
StartLimitIntervalSec=0
|
||||||
StartLimitBurst=5
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
|
@ -1670,8 +1669,10 @@ Environment="LOGNAME={username}"
|
||||||
Environment="PATH={sane_path}"
|
Environment="PATH={sane_path}"
|
||||||
Environment="VIRTUAL_ENV={venv_dir}"
|
Environment="VIRTUAL_ENV={venv_dir}"
|
||||||
Environment="HERMES_HOME={hermes_home}"
|
Environment="HERMES_HOME={hermes_home}"
|
||||||
Restart=on-failure
|
Restart=always
|
||||||
RestartSec=30
|
RestartSec=60
|
||||||
|
RestartMaxDelaySec=300
|
||||||
|
RestartSteps=5
|
||||||
RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
|
RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
|
||||||
KillMode=mixed
|
KillMode=mixed
|
||||||
KillSignal=SIGTERM
|
KillSignal=SIGTERM
|
||||||
|
|
@ -1691,9 +1692,9 @@ WantedBy=multi-user.target
|
||||||
sane_path = ":".join(path_entries)
|
sane_path = ":".join(path_entries)
|
||||||
return f"""[Unit]
|
return f"""[Unit]
|
||||||
Description={SERVICE_DESCRIPTION}
|
Description={SERVICE_DESCRIPTION}
|
||||||
After=network.target
|
After=network-online.target
|
||||||
StartLimitIntervalSec=600
|
Wants=network-online.target
|
||||||
StartLimitBurst=5
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
|
@ -1702,8 +1703,10 @@ WorkingDirectory={working_dir}
|
||||||
Environment="PATH={sane_path}"
|
Environment="PATH={sane_path}"
|
||||||
Environment="VIRTUAL_ENV={venv_dir}"
|
Environment="VIRTUAL_ENV={venv_dir}"
|
||||||
Environment="HERMES_HOME={hermes_home}"
|
Environment="HERMES_HOME={hermes_home}"
|
||||||
Restart=on-failure
|
Restart=always
|
||||||
RestartSec=30
|
RestartSec=60
|
||||||
|
RestartMaxDelaySec=300
|
||||||
|
RestartSteps=5
|
||||||
RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
|
RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
|
||||||
KillMode=mixed
|
KillMode=mixed
|
||||||
KillSignal=SIGTERM
|
KillSignal=SIGTERM
|
||||||
|
|
@ -2451,7 +2454,7 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# Exit with code 1 if gateway fails to connect any platform,
|
# Exit with code 1 if gateway fails to connect any platform,
|
||||||
# so systemd Restart=on-failure will retry on transient errors
|
# so systemd Restart=always will retry on transient errors
|
||||||
verbosity = None if quiet else verbose
|
verbosity = None if quiet else verbose
|
||||||
try:
|
try:
|
||||||
success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))
|
success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue