fix(config): warn loudly on YAML parse failure instead of silent default fallback (#23585)

A YAML parse error in ~/.hermes/config.yaml caused load_config() to print
one line to stdout (Warning: Failed to load config: ...) and silently fall
back to DEFAULT_CONFIG, dropping every user override (auxiliary providers,
fallback chain, model settings). Users only noticed when downstream
behavior misbehaved — see issue #23570 where a tab-indent error in the
auxiliary section caused aux fallback to use OpenRouter (depleted) instead
of the configured Codex/MiniMax chain.

Now: log at WARNING (so 'hermes logs' surfaces it), write a prominent line
to stderr, dedup on (path, mtime_ns, size) so concurrent loads don't spam,
and re-warn after the user edits the file. Both call sites (raw read +
merged load) route through the same helper.

Refs #23570
This commit is contained in:
Teknium 2026-05-10 22:36:19 -07:00 committed by GitHub
parent 3af3c4eb8c
commit 228a4d11ae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 120 additions and 2 deletions

View file

@ -28,6 +28,48 @@ from typing import Dict, Any, Optional, List, Tuple
logger = logging.getLogger(__name__)
# Track which (config_path, mtime_ns, size) tuples we've already warned about
# so concurrent CLI/gateway loads of a broken config.yaml don't spam stderr
# every time. Cleared automatically when the file changes (different mtime).
_CONFIG_PARSE_WARNED: set = set()
def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
"""Surface a config.yaml parse failure to user, log, and stderr.
A YAML parse error in ``~/.hermes/config.yaml`` causes ``load_config()``
to silently fall back to ``DEFAULT_CONFIG``, which means every user
override (auxiliary providers, fallback chain, model overrides, etc.)
is dropped. Before this helper that was a one-line ``print(...)`` that
scrolled off-screen on the first invocation and was never seen again.
Now: warn once per (path, mtime_ns, size) on stderr **and** in
``agent.log`` / ``errors.log`` at WARNING level so ``hermes logs``
surfaces it. Re-warns automatically if the file changes (different
mtime/size), so users editing the config see the next failure.
"""
try:
st = config_path.stat()
key = (str(config_path), st.st_mtime_ns, st.st_size)
except OSError:
key = (str(config_path), 0, 0)
if key in _CONFIG_PARSE_WARNED:
return
_CONFIG_PARSE_WARNED.add(key)
msg = (
f"Failed to parse {config_path}: {exc}. "
f"Falling back to default config — every user override "
f"(auxiliary providers, fallback chain, model settings) is being IGNORED. "
f"Fix the YAML and restart."
)
logger.warning(msg)
try:
sys.stderr.write(f"⚠️ hermes config: {msg}\n")
sys.stderr.flush()
except Exception:
pass
_IS_WINDOWS = platform.system() == "Windows"
_ENV_VAR_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
_LAST_EXPANDED_CONFIG_BY_PATH: Dict[str, Any] = {}
@ -4004,7 +4046,8 @@ def read_raw_config() -> Dict[str, Any]:
try:
with open(config_path, encoding="utf-8") as f:
data = yaml.safe_load(f) or {}
except Exception:
except Exception as e:
_warn_config_parse_failure(config_path, e)
return {}
if not isinstance(data, dict):
@ -4054,7 +4097,7 @@ def load_config() -> Dict[str, Any]:
config = _deep_merge(config, user_config)
except Exception as e:
print(f"Warning: Failed to load config: {e}")
_warn_config_parse_failure(config_path, e)
normalized = _normalize_root_model_keys(_normalize_max_turns_config(config))
expanded = _expand_env_vars(normalized)

View file

@ -81,6 +81,81 @@ class TestLoadConfigDefaults:
assert "max_turns" not in config
class TestLoadConfigParseFailure:
"""A YAML parse failure must NOT silently fall back to defaults.
Before issue #23570 this was a single ``print(...)`` that scrolled past
on the first invocation users saw aux-fallback misbehavior with no clue
their config.yaml was being ignored. The helper must:
* log at WARNING (so ``hermes logs`` surfaces it)
* also write to stderr (so it's visible at startup even before
``setup_logging()`` has wired up file handlers)
* dedup on (path, mtime_ns, size) so concurrent loads don't spam
* re-warn after the user edits the file (different mtime)
"""
def test_logs_and_warns_on_parse_failure(self, tmp_path, caplog, capsys):
# Reset the dedup cache so this test isn't affected by other tests
# that may have warned about a different broken config.
from hermes_cli import config as cfg_mod
cfg_mod._CONFIG_PARSE_WARNED.clear()
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):
(tmp_path / "config.yaml").write_text("\tbroken tab indent:\n")
import logging
with caplog.at_level(logging.WARNING, logger="hermes_cli.config"):
config = load_config()
# Falls back to defaults — confirms the silent-fallback we're warning about
assert config["model"] == DEFAULT_CONFIG["model"]
# WARNING-level log was emitted with file path + reason
assert any(
str(tmp_path / "config.yaml") in rec.message
and "Falling back to default config" in rec.message
for rec in caplog.records
), f"expected WARNING log, got: {[r.message for r in caplog.records]}"
# stderr also got a user-visible message (with the ⚠️ marker so it
# stands out at hermes startup before logging is configured)
captured = capsys.readouterr()
assert "hermes config:" in captured.err
assert str(tmp_path / "config.yaml") in captured.err
def test_dedup_on_repeated_load_same_file(self, tmp_path, capsys):
from hermes_cli import config as cfg_mod
cfg_mod._CONFIG_PARSE_WARNED.clear()
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):
(tmp_path / "config.yaml").write_text("\tbroken:\n")
load_config()
first = capsys.readouterr().err
assert "hermes config:" in first
load_config()
second = capsys.readouterr().err
assert second == "", "second load should NOT re-warn (same file, same mtime)"
def test_rewarns_after_file_edit(self, tmp_path, capsys):
import time
from hermes_cli import config as cfg_mod
cfg_mod._CONFIG_PARSE_WARNED.clear()
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):
(tmp_path / "config.yaml").write_text("\tbroken:\n")
load_config()
capsys.readouterr() # discard first warning
# Edit the file (still broken, but different content) — mtime changes
time.sleep(0.05)
(tmp_path / "config.yaml").write_text("\tstill broken differently:\n")
load_config()
after_edit = capsys.readouterr().err
assert "hermes config:" in after_edit, "edited file should re-warn"
class TestSaveAndLoadRoundtrip:
def test_roundtrip(self, tmp_path):
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):