fix(gateway): guard yaml.safe_load and float() env var casts against crash

Two defensive fixes in gateway/run.py:

1. yaml.safe_load returning None on empty config files (line 12707):
   GatewayConfig.from_dict(data) crashes with AttributeError when the YAML
   file is empty, because safe_load returns None for an empty document.
   All 6 other yaml.safe_load call sites already use `or {}` — this one
   was missed.
   Impact: the gateway fails to start when the --config file is empty.

2. float() on env vars without a ValueError guard (lines 3951, 11758,
   11806, 11808): HERMES_AGENT_TIMEOUT, HERMES_AGENT_TIMEOUT_WARNING, and
   HERMES_AGENT_NOTIFY_INTERVAL are cast via float() directly from
   os.getenv(). A non-numeric value (e.g. the typo "abc") raises ValueError
   and crashes the agent turn or gateway startup. Each cast is now wrapped
   in try/except and falls back to its default (1800, 900, or 180 seconds).
   Impact: a single misconfigured env var crashes the entire gateway.
This commit is contained in:
vominh1919 2026-04-30 16:08:17 +07:00 committed by Teknium
parent 5af8fa5c8c
commit ca87c822ed

View file

@ -3948,7 +3948,10 @@ class GatewayRunner:
# wall-clock age alone isn't sufficient. Evict only when the agent
# has been *idle* beyond the inactivity threshold (or when the agent
# object has no activity tracker and wall-clock age is extreme).
_raw_stale_timeout = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
try:
_raw_stale_timeout = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
except (ValueError, TypeError):
_raw_stale_timeout = 1800.0
_stale_ts = self._running_agents_ts.get(_quick_key, 0)
if _quick_key in self._running_agents and _stale_ts:
_stale_age = time.time() - _stale_ts
@ -11755,7 +11758,10 @@ class GatewayRunner:
# Config: agent.gateway_notify_interval in config.yaml, or
# HERMES_AGENT_NOTIFY_INTERVAL env var. Default 180s (3 min).
# 0 = disable notifications.
_NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 180))
try:
_NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 180))
except (ValueError, TypeError):
_NOTIFY_INTERVAL_RAW = 180.0
_NOTIFY_INTERVAL = _NOTIFY_INTERVAL_RAW if _NOTIFY_INTERVAL_RAW > 0 else None
_notify_start = time.time()
@ -11803,9 +11809,15 @@ class GatewayRunner:
# Config: agent.gateway_timeout in config.yaml, or
# HERMES_AGENT_TIMEOUT env var (env var takes precedence).
# Default 1800s (30 min inactivity). 0 = unlimited.
_agent_timeout_raw = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
try:
_agent_timeout_raw = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
except (ValueError, TypeError):
_agent_timeout_raw = 1800.0
_agent_timeout = _agent_timeout_raw if _agent_timeout_raw > 0 else None
_agent_warning_raw = float(os.getenv("HERMES_AGENT_TIMEOUT_WARNING", 900))
try:
_agent_warning_raw = float(os.getenv("HERMES_AGENT_TIMEOUT_WARNING", 900))
except (ValueError, TypeError):
_agent_warning_raw = 900.0
_agent_warning = _agent_warning_raw if _agent_warning_raw > 0 else None
_warning_fired = False
_executor_task = asyncio.ensure_future(
@ -12704,7 +12716,7 @@ def main():
if args.config:
import yaml
with open(args.config, encoding="utf-8") as f:
data = yaml.safe_load(f)
data = yaml.safe_load(f) or {}
config = GatewayConfig.from_dict(data)
# Run the gateway - exit with code 1 if no platforms connected,