fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml (#38222)

* fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml

hermes doctor now flags when ~/.hermes/.env carries a HERMES_MAX_ITERATIONS
value that disagrees with agent.max_turns in config.yaml, and 'hermes doctor
--fix' removes the stale .env line so config.yaml is authoritative. 'hermes
config show' surfaces the same drift inline under Max turns.

The setup wizard stopped dual-writing this value, but users who edited only
config.yaml from a pre-fix install keep a .env ghost. The gateway bridge
normally overrides it at startup, but if the bridge bails on any earlier
config-parse error the ghost silently wins — config says 400 while the
gateway activity line reads N/90.

The detector reads the .env FILE directly (load_env), not get_env_value/
os.environ, since the startup bridge may already have overwritten os.environ
with the config value.

Closes #17534.

* fix(config): stop offering HERMES_MAX_ITERATIONS as an editable env var

Removes HERMES_MAX_ITERATIONS from OPTIONAL_ENV_VARS so the dashboard env
editor (PUT /api/env) and any env-var prompt no longer let a user write it
to .env — which would recreate the stale ghost that shadows config.yaml's
agent.max_turns (issue #17534). The iteration budget is configured only via
config.yaml; the env var stays a read-only backward-compat fallback in the
gateway/CLI, never a promoted write target.

Regression test asserts it is absent from OPTIONAL_ENV_VARS.
This commit is contained in:
Teknium 2026-06-03 06:38:40 -07:00 committed by GitHub
parent de26b17854
commit 6ee046a72f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 168 additions and 8 deletions

View file

@ -3346,13 +3346,6 @@ OPTIONAL_ENV_VARS = {
"password": True,
"category": "setting",
},
"HERMES_MAX_ITERATIONS": {
"description": "Maximum tool-calling iterations per conversation (default: 90)",
"prompt": "Max iterations",
"url": None,
"password": False,
"category": "setting",
},
# HERMES_TOOL_PROGRESS and HERMES_TOOL_PROGRESS_MODE are deprecated —
# now configured via display.tool_progress in config.yaml (off|new|all|verbose).
# Gateway falls back to these env vars for backward compatibility.
@ -5593,7 +5586,21 @@ def show_config():
print()
print(color("◆ Model", Colors.CYAN, Colors.BOLD))
print(f" Model: {config.get('model', 'not set')}")
print(f" Max turns: {config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])}")
_cfg_max_turns = config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])
print(f" Max turns: {_cfg_max_turns}")
# Warn on stale HERMES_MAX_ITERATIONS ghost in .env that disagrees with
# config.yaml (issue #17534). Read the .env FILE directly so we catch the
# ghost even when the gateway bridge already overrode os.environ.
try:
_env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
if _env_ghost is not None and str(_env_ghost).strip() != str(_cfg_max_turns).strip():
print(color(
f" ⚠ .env has stale HERMES_MAX_ITERATIONS={_env_ghost} "
f"(run 'hermes doctor --fix' to remove)",
Colors.YELLOW,
))
except Exception:
pass
# Display
print()

View file

@ -891,6 +891,63 @@ def run_doctor(args):
except Exception:
pass
# Detect stale HERMES_MAX_ITERATIONS ghost in .env shadowing
# agent.max_turns in config.yaml (issue #17534). The setup wizard
# used to dual-write the iteration budget to both stores; users who
# later edit only config.yaml are left with a .env ghost. The gateway
# bridge normally derives HERMES_MAX_ITERATIONS from agent.max_turns
# at startup, but if that bridge bails (any earlier config-parse
# error), the stale .env value silently wins and the agent runs at the
# wrong budget — e.g. config says 400 but the activity line reads N/90.
# Read the .env FILE directly (load_env), not get_env_value/os.environ,
# which the startup bridge may already have overridden.
try:
import yaml
from hermes_cli.config import load_env, remove_env_value
with open(config_path, encoding="utf-8") as f:
raw_config = yaml.safe_load(f) or {}
agent_cfg = raw_config.get("agent")
cfg_max_turns = (
agent_cfg.get("max_turns")
if isinstance(agent_cfg, dict)
else None
)
# Legacy root-level key counts too.
if cfg_max_turns is None:
cfg_max_turns = raw_config.get("max_turns")
env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
drift = (
cfg_max_turns is not None
and env_ghost is not None
and str(cfg_max_turns).strip() != str(env_ghost).strip()
)
if drift:
check_warn(
f"HERMES_MAX_ITERATIONS={env_ghost} in .env shadows "
f"agent.max_turns={cfg_max_turns} in config.yaml",
"(stale ghost from an earlier `hermes setup` run)",
)
if should_fix:
if remove_env_value("HERMES_MAX_ITERATIONS"):
check_ok(
"Removed stale HERMES_MAX_ITERATIONS from .env "
f"(config.yaml agent.max_turns={cfg_max_turns} is now authoritative)"
)
fixed_count += 1
else:
check_warn("Could not remove HERMES_MAX_ITERATIONS from .env")
manual_issues.append(
"Manually delete the HERMES_MAX_ITERATIONS line from "
f"{_DHH}/.env — config.yaml agent.max_turns is authoritative."
)
else:
issues.append(
"Stale HERMES_MAX_ITERATIONS in .env shadows config.yaml — "
"run 'hermes doctor --fix'"
)
except Exception:
pass
# Validate config structure (catches malformed custom_providers, etc.)
try:
from hermes_cli.config import validate_config_structure

View file

@ -486,6 +486,18 @@ class TestOptionalEnvVarsRegistry:
all_vars.extend(vars_list)
assert "TAVILY_API_KEY" in all_vars
def test_max_iterations_not_offered_as_env_var(self):
"""HERMES_MAX_ITERATIONS must NOT be in OPTIONAL_ENV_VARS (issue #17534).
Offering it as an editable env var (dashboard, `hermes setup`) lets a
user write it to .env, recreating the stale ghost that shadows
config.yaml's agent.max_turns. The iteration budget is configured ONLY
via config.yaml; HERMES_MAX_ITERATIONS remains a read-only backward-compat
fallback in the gateway/CLI, never a promoted write target.
"""
from hermes_cli.config import OPTIONAL_ENV_VARS
assert "HERMES_MAX_ITERATIONS" not in OPTIONAL_ENV_VARS
class TestConfigMigrationSecretPrompts:
def test_required_secret_env_prompt_uses_masked_prompt(self, tmp_path, monkeypatch):

View file

@ -1274,3 +1274,87 @@ class TestDoctorCodexCliHintPlacement:
minimax_idx = next(i for i, l in enumerate(lines) if "MiniMax OAuth" in l)
assert self._hint_line() not in lines[minimax_idx - 1]
assert minimax_idx + 1 >= len(lines) or self._hint_line() not in lines[minimax_idx + 1]
class TestDoctorStaleMaxIterationsDrift:
"""Regression for #17534: a stale HERMES_MAX_ITERATIONS in .env shadows
agent.max_turns in config.yaml. The repro symptom is config.yaml saying
400 while the gateway activity line reads N/90. Doctor must detect the
drift, and `--fix` must remove the .env ghost (config.yaml wins).
The detector reads the .env FILE directly, NOT os.environ the gateway
startup bridge can already have overridden os.environ to the config value,
so the ghost is only visible in the file.
"""
def _run_config_section(self, monkeypatch, tmp_path, *, fix, ghost, cfg_turns,
os_environ_value=None):
import pathlib
import contextlib
import io
from argparse import Namespace
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir(parents=True)
(hermes_home / "config.yaml").write_text(
f"agent:\n max_turns: {cfg_turns}\n", encoding="utf-8"
)
env_lines = ["OPENAI_API_KEY=sk-test\n"]
if ghost is not None:
env_lines.append(f"HERMES_MAX_ITERATIONS={ghost}\n")
(hermes_home / ".env").write_text("".join(env_lines), encoding="utf-8")
monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home)
monkeypatch.setattr(doctor_mod, "get_hermes_home", lambda: hermes_home)
# Point the config helpers at the temp home.
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
if os_environ_value is not None:
# Simulate the gateway bridge having already overridden os.environ.
monkeypatch.setenv("HERMES_MAX_ITERATIONS", str(os_environ_value))
else:
monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
# Short-circuit at the Tool Availability stage — the drift check runs
# well before it in the Configuration Files section.
fake_model_tools = types.SimpleNamespace(
check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)),
TOOLSET_REQUIREMENTS={},
)
monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools)
buf = io.StringIO()
with contextlib.redirect_stdout(buf), pytest.raises(SystemExit):
doctor_mod.run_doctor(Namespace(fix=fix))
return buf.getvalue(), hermes_home
def test_detects_drift_warn_only(self, monkeypatch, tmp_path):
out, hermes_home = self._run_config_section(
monkeypatch, tmp_path, fix=False, ghost=90, cfg_turns=400,
os_environ_value=400, # bridge contaminated os.environ
)
assert "HERMES_MAX_ITERATIONS=90" in out
assert "shadows" in out
# Warn-only must NOT mutate .env.
assert "HERMES_MAX_ITERATIONS=90" in (hermes_home / ".env").read_text(encoding="utf-8")
def test_fix_removes_ghost(self, monkeypatch, tmp_path):
out, hermes_home = self._run_config_section(
monkeypatch, tmp_path, fix=True, ghost=90, cfg_turns=400,
os_environ_value=400,
)
assert "Removed stale HERMES_MAX_ITERATIONS" in out
env_after = (hermes_home / ".env").read_text(encoding="utf-8")
assert "HERMES_MAX_ITERATIONS" not in env_after
assert "OPENAI_API_KEY=sk-test" in env_after # other keys preserved
def test_no_drift_when_values_match(self, monkeypatch, tmp_path):
out, _ = self._run_config_section(
monkeypatch, tmp_path, fix=False, ghost=400, cfg_turns=400,
)
assert "shadows" not in out
def test_no_drift_when_ghost_absent(self, monkeypatch, tmp_path):
out, _ = self._run_config_section(
monkeypatch, tmp_path, fix=False, ghost=None, cfg_turns=400,
)
assert "shadows" not in out