diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 50953319a4..af40444922 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -188,7 +188,7 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool: SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)`` which drains in-flight agent runs (up to ``agent.restart_drain_timeout`` - seconds), then exits with code 75. Both systemd (``Restart=on-failure`` + seconds), then exits with code 75. Both systemd (``Restart=always`` + ``RestartForceExitStatus=75``) and launchd (``KeepAlive.SuccessfulExit = false``) relaunch the process after the graceful exit. @@ -1655,8 +1655,7 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) Description={SERVICE_DESCRIPTION} After=network-online.target Wants=network-online.target -StartLimitIntervalSec=600 -StartLimitBurst=5 +StartLimitIntervalSec=0 [Service] Type=simple @@ -1670,8 +1669,10 @@ Environment="LOGNAME={username}" Environment="PATH={sane_path}" Environment="VIRTUAL_ENV={venv_dir}" Environment="HERMES_HOME={hermes_home}" -Restart=on-failure -RestartSec=30 +Restart=always +RestartSec=60 +RestartMaxDelaySec=300 +RestartSteps=5 RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE} KillMode=mixed KillSignal=SIGTERM @@ -1691,9 +1692,9 @@ WantedBy=multi-user.target sane_path = ":".join(path_entries) return f"""[Unit] Description={SERVICE_DESCRIPTION} -After=network.target -StartLimitIntervalSec=600 -StartLimitBurst=5 +After=network-online.target +Wants=network-online.target +StartLimitIntervalSec=0 [Service] Type=simple @@ -1702,8 +1703,10 @@ WorkingDirectory={working_dir} Environment="PATH={sane_path}" Environment="VIRTUAL_ENV={venv_dir}" Environment="HERMES_HOME={hermes_home}" -Restart=on-failure -RestartSec=30 +Restart=always +RestartSec=60 +RestartMaxDelaySec=300 +RestartSteps=5 RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE} KillMode=mixed KillSignal=SIGTERM @@ -2451,7 +2454,7 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False): print() # Exit with code 1 if gateway fails to connect any platform, - # so systemd Restart=on-failure will retry on transient errors + # so systemd Restart=always will retry on transient errors verbosity = None if quiet else verbose try: success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))