fix: prevent systemd restart storm on gateway connection failure

Cherry-picked from PR #2319 by @itenev.

When the gateway fails to connect (e.g. PrivilegedIntentsRequired,
missing token), the generated unit's RestartSec=10, with no start rate
limit configured, leads to a storm of rapid restarts and reconnect
attempts that floods the logs and triggers platform-side rate limits.

- StartLimitIntervalSec=600 + StartLimitBurst=5 in [Unit] (max 5
  start attempts per 10 min)
- RestartSec: 10 → 30
- Applied to both templates in gateway.py and to the template in
  scripts/hermes-gateway
This commit is contained in:
Teknium 2026-03-21 09:26:39 -07:00
parent 3ba6043c62
commit 326b146d68
No known key found for this signature in database
2 changed files with 9 additions and 3 deletions

View file

@ -420,6 +420,8 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
Description={SERVICE_DESCRIPTION} Description={SERVICE_DESCRIPTION}
After=network-online.target After=network-online.target
Wants=network-online.target Wants=network-online.target
StartLimitIntervalSec=600
StartLimitBurst=5
[Service] [Service]
Type=simple Type=simple
@ -434,7 +436,7 @@ Environment="PATH={sane_path}"
Environment="VIRTUAL_ENV={venv_dir}" Environment="VIRTUAL_ENV={venv_dir}"
Environment="HERMES_HOME={hermes_home}" Environment="HERMES_HOME={hermes_home}"
Restart=on-failure Restart=on-failure
RestartSec=10 RestartSec=30
KillMode=mixed KillMode=mixed
KillSignal=SIGTERM KillSignal=SIGTERM
TimeoutStopSec=60 TimeoutStopSec=60
@ -448,6 +450,8 @@ WantedBy=multi-user.target
return f"""[Unit] return f"""[Unit]
Description={SERVICE_DESCRIPTION} Description={SERVICE_DESCRIPTION}
After=network.target After=network.target
StartLimitIntervalSec=600
StartLimitBurst=5
[Service] [Service]
Type=simple Type=simple
@ -457,7 +461,7 @@ Environment="PATH={sane_path}"
Environment="VIRTUAL_ENV={venv_dir}" Environment="VIRTUAL_ENV={venv_dir}"
Environment="HERMES_HOME={hermes_home}" Environment="HERMES_HOME={hermes_home}"
Restart=on-failure Restart=on-failure
RestartSec=10 RestartSec=30
KillMode=mixed KillMode=mixed
KillSignal=SIGTERM KillSignal=SIGTERM
TimeoutStopSec=60 TimeoutStopSec=60

View file

@ -82,13 +82,15 @@ def generate_systemd_unit() -> str:
return f"""[Unit] return f"""[Unit]
Description={SERVICE_DESCRIPTION} Description={SERVICE_DESCRIPTION}
After=network.target After=network.target
StartLimitIntervalSec=600
StartLimitBurst=5
[Service] [Service]
Type=simple Type=simple
ExecStart={python_path} {script_path} run ExecStart={python_path} {script_path} run
WorkingDirectory={working_dir} WorkingDirectory={working_dir}
Restart=on-failure Restart=on-failure
RestartSec=10 RestartSec=30
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal