From ca87c822ede2574e0f8b32ca66e0f886d7f0a770 Mon Sep 17 00:00:00 2001 From: vominh1919 Date: Thu, 30 Apr 2026 16:08:17 +0700 Subject: [PATCH] fix(gateway): guard yaml.safe_load and float() env var casts against crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two defensive fixes in gateway/run.py: 1. yaml.safe_load returning None on empty config files (line 12706): GatewayConfig.from_dict(data) crashes with AttributeError when the YAML file is empty because safe_load returns None. All 6 other yaml.safe_load call sites already use `or {}` — this one was missed. Impact: gateway fails to start with empty --config file. 2. float() on env vars without ValueError guard (lines 3951, 11757, 11805, 11807): HERMES_AGENT_TIMEOUT, HERMES_AGENT_TIMEOUT_WARNING, and HERMES_AGENT_NOTIFY_INTERVAL are cast via float() directly from os.getenv(). A typo (e.g. "abc") raises ValueError and crashes the agent turn or gateway startup. Impact: single misconfigured env var crashes the entire gateway. --- gateway/run.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 19dc5eae74a..8c6c4a51c42 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3948,7 +3948,10 @@ class GatewayRunner: # wall-clock age alone isn't sufficient. Evict only when the agent # has been *idle* beyond the inactivity threshold (or when the agent # object has no activity tracker and wall-clock age is extreme). - _raw_stale_timeout = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800)) + try: + _raw_stale_timeout = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800)) + except (ValueError, TypeError): + _raw_stale_timeout = 1800.0 _stale_ts = self._running_agents_ts.get(_quick_key, 0) if _quick_key in self._running_agents and _stale_ts: _stale_age = time.time() - _stale_ts @@ -11755,7 +11758,10 @@ class GatewayRunner: # Config: agent.gateway_notify_interval in config.yaml, or # HERMES_AGENT_NOTIFY_INTERVAL env var. Default 180s (3 min). # 0 = disable notifications. - _NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 180)) + try: + _NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 180)) + except (ValueError, TypeError): + _NOTIFY_INTERVAL_RAW = 180.0 _NOTIFY_INTERVAL = _NOTIFY_INTERVAL_RAW if _NOTIFY_INTERVAL_RAW > 0 else None _notify_start = time.time() @@ -11803,9 +11809,15 @@ class GatewayRunner: # Config: agent.gateway_timeout in config.yaml, or # HERMES_AGENT_TIMEOUT env var (env var takes precedence). # Default 1800s (30 min inactivity). 0 = unlimited. - _agent_timeout_raw = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800)) + try: + _agent_timeout_raw = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800)) + except (ValueError, TypeError): + _agent_timeout_raw = 1800.0 _agent_timeout = _agent_timeout_raw if _agent_timeout_raw > 0 else None - _agent_warning_raw = float(os.getenv("HERMES_AGENT_TIMEOUT_WARNING", 900)) + try: + _agent_warning_raw = float(os.getenv("HERMES_AGENT_TIMEOUT_WARNING", 900)) + except (ValueError, TypeError): + _agent_warning_raw = 900.0 _agent_warning = _agent_warning_raw if _agent_warning_raw > 0 else None _warning_fired = False _executor_task = asyncio.ensure_future( @@ -12704,7 +12716,7 @@ def main(): if args.config: import yaml with open(args.config, encoding="utf-8") as f: - data = yaml.safe_load(f) + data = yaml.safe_load(f) or {} config = GatewayConfig.from_dict(data) # Run the gateway - exit with code 1 if no platforms connected,