diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index a82417a601..b3e655a51b 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -185,6 +185,13 @@ class WhatsAppAdapter(BasePlatformAdapter): self._bridge_log: Optional[Path] = None self._poll_task: Optional[asyncio.Task] = None self._http_session: Optional["aiohttp.ClientSession"] = None + # Set to True by disconnect() before we SIGTERM our child bridge so + # _check_managed_bridge_exit() can distinguish an intentional + # shutdown-time exit (returncode -15 / -2 / 0) from a real crash. + # Without this, every graceful gateway shutdown/restart would log + # "Fatal whatsapp adapter error" plus dispatch a fatal-error + # notification before the normal "✓ whatsapp disconnected" fires. + self._shutting_down: bool = False def _whatsapp_require_mention(self) -> bool: configured = self.config.extra.get("require_mention") @@ -555,6 +562,21 @@ class WhatsAppAdapter(BasePlatformAdapter): if returncode is None: return None + # Planned shutdown: disconnect() sets _shutting_down before it sends + # SIGTERM to the bridge, so a returncode of -15 (SIGTERM), -2 (SIGINT), + # or 0 (clean exit) at that point is expected, not a crash. Treat it + # as informational and skip the fatal-error path. + # getattr-with-default keeps tests that construct the adapter via + # ``WhatsAppAdapter.__new__`` (bypassing __init__) working without + # every _make_adapter() helper having to seed the attribute. + if getattr(self, "_shutting_down", False) and returncode in (0, -2, -15): + logger.info( + "[%s] Bridge exited during shutdown (code %d).", + self.name, + returncode, + ) + return None + message = f"WhatsApp bridge process exited unexpectedly (code {returncode})." if not self.has_fatal_error: logger.error("[%s] %s", self.name, message) @@ -565,6 +587,10 @@ class WhatsAppAdapter(BasePlatformAdapter): async def disconnect(self) -> None: """Stop the WhatsApp bridge and clean up any orphaned processes.""" + # Flip the shutdown flag BEFORE signalling the child so the exit-check + # path (which runs from other tasks like send() and the poll loop) + # doesn't race us and report the intentional termination as fatal. + self._shutting_down = True if self._bridge_process: try: try: diff --git a/gateway/run.py b/gateway/run.py index 23c67eec09..db6fcc9756 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -407,37 +407,37 @@ if _config_path.exists(): os.environ[_env_map["base_url"]] = _base_url if _api_key: os.environ[_env_map["api_key"]] = _api_key + # config.yaml is the documented, authoritative source for these + # settings — it unconditionally wins over .env values. Previously + # the guards below read `if X not in os.environ` and let stale + # .env entries (e.g. HERMES_MAX_ITERATIONS=60 written by an old + # `hermes setup` run) silently shadow the user's current config. + # See PR #18413 / the 60-vs-500 max_turns incident. _agent_cfg = _cfg.get("agent", {}) if _agent_cfg and isinstance(_agent_cfg, dict): if "max_turns" in _agent_cfg: os.environ["HERMES_MAX_ITERATIONS"] = str(_agent_cfg["max_turns"]) - # Bridge agent.gateway_timeout → HERMES_AGENT_TIMEOUT env var. - # Env var from .env takes precedence (already in os.environ). - if "gateway_timeout" in _agent_cfg and "HERMES_AGENT_TIMEOUT" not in os.environ: + if "gateway_timeout" in _agent_cfg: os.environ["HERMES_AGENT_TIMEOUT"] = str(_agent_cfg["gateway_timeout"]) - if "gateway_timeout_warning" in _agent_cfg and "HERMES_AGENT_TIMEOUT_WARNING" not in os.environ: + if "gateway_timeout_warning" in _agent_cfg: os.environ["HERMES_AGENT_TIMEOUT_WARNING"] = str(_agent_cfg["gateway_timeout_warning"]) - if "gateway_notify_interval" in _agent_cfg and "HERMES_AGENT_NOTIFY_INTERVAL" not in os.environ: + if "gateway_notify_interval" in _agent_cfg: os.environ["HERMES_AGENT_NOTIFY_INTERVAL"] = str(_agent_cfg["gateway_notify_interval"]) - if "restart_drain_timeout" in _agent_cfg and "HERMES_RESTART_DRAIN_TIMEOUT" not in os.environ: + if "restart_drain_timeout" in _agent_cfg: os.environ["HERMES_RESTART_DRAIN_TIMEOUT"] = str(_agent_cfg["restart_drain_timeout"]) - if ( - "gateway_auto_continue_freshness" in _agent_cfg - and "HERMES_AUTO_CONTINUE_FRESHNESS" not in os.environ - ): + if "gateway_auto_continue_freshness" in _agent_cfg: os.environ["HERMES_AUTO_CONTINUE_FRESHNESS"] = str( _agent_cfg["gateway_auto_continue_freshness"] ) _display_cfg = _cfg.get("display", {}) if _display_cfg and isinstance(_display_cfg, dict): - if "busy_input_mode" in _display_cfg and "HERMES_GATEWAY_BUSY_INPUT_MODE" not in os.environ: + if "busy_input_mode" in _display_cfg: os.environ["HERMES_GATEWAY_BUSY_INPUT_MODE"] = str(_display_cfg["busy_input_mode"]) - if "busy_ack_enabled" in _display_cfg and "HERMES_GATEWAY_BUSY_ACK_ENABLED" not in os.environ: + if "busy_ack_enabled" in _display_cfg: os.environ["HERMES_GATEWAY_BUSY_ACK_ENABLED"] = str(_display_cfg["busy_ack_enabled"]) # Timezone: bridge config.yaml → HERMES_TIMEZONE env var. - # HERMES_TIMEZONE from .env takes precedence (already in os.environ). _tz_cfg = _cfg.get("timezone", "") - if _tz_cfg and isinstance(_tz_cfg, str) and "HERMES_TIMEZONE" not in os.environ: + if _tz_cfg and isinstance(_tz_cfg, str): os.environ["HERMES_TIMEZONE"] = _tz_cfg.strip() # Security settings _security_cfg = _cfg.get("security", {}) @@ -445,8 +445,24 @@ if _config_path.exists(): _redact = _security_cfg.get("redact_secrets") if _redact is not None: os.environ["HERMES_REDACT_SECRETS"] = str(_redact).lower() - except Exception: - pass # Non-fatal; gateway can still run with .env values + except Exception as _bridge_err: + # Previously this was silent (`except Exception: pass`), which + # hid partial bridge failures and let .env defaults shadow + # config.yaml values — users observed max_turns=500 in config + # but a 60-iteration cap in practice. Surface the failure to + # stderr so operators see it even though `logger` is not yet + # initialized at module-import time (logger is defined further + # down this module). + print( + f" Warning: config.yaml → env bridge failed: " + f"{type(_bridge_err).__name__}: {_bridge_err}", + file=sys.stderr, + ) + print( + " Gateway will fall back to .env values, which may not match " + "your current config.yaml. Run `hermes doctor` to investigate.", + file=sys.stderr, + ) # Apply IPv4 preference if configured (before any HTTP clients are created). try: @@ -2584,6 +2600,18 @@ class GatewayRunner: """ logger.info("Starting Hermes Gateway...") logger.info("Session storage: %s", self.config.sessions_dir) + # Log the resolved max_iterations budget so operators can verify the + # config.yaml → env bridge did the right thing at a glance (instead + # of silently running at a stale .env value for weeks). + try: + _effective_max_iter = int(os.getenv("HERMES_MAX_ITERATIONS", "90")) + logger.info( + "Agent budget: max_iterations=%d (agent.max_turns from config.yaml, " + "or HERMES_MAX_ITERATIONS from .env, or default 90)", + _effective_max_iter, + ) + except Exception: + pass try: from hermes_cli.profiles import get_active_profile_name _profile = get_active_profile_name() @@ -10453,16 +10481,28 @@ class GatewayRunner: return metadata = {"thread_id": thread_id} if thread_id else None - await adapter.send( + result = await adapter.send( chat_id, "♻ Gateway restarted successfully. Your session continues.", metadata=metadata, ) - logger.info( - "Sent restart notification to %s:%s", - platform_str, - chat_id, - ) + # adapter.send() catches provider errors (e.g. "Chat not found") + # and returns SendResult(success=False) rather than raising, so + # we must inspect the result before claiming success — otherwise + # the log line is misleading and hides real delivery failures. + if getattr(result, "success", False): + logger.info( + "Sent restart notification to %s:%s", + platform_str, + chat_id, + ) + else: + logger.warning( + "Restart notification to %s:%s was not delivered: %s", + platform_str, + chat_id, + getattr(result, "error", "unknown error"), + ) except Exception as e: logger.warning("Restart notification failed: %s", e) finally: diff --git a/hermes_cli/config.py b/hermes_cli/config.py index fe989619bb..17e10c08d6 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -400,7 +400,12 @@ DEFAULT_CONFIG = { # The gateway stops accepting new work, waits for running agents # to finish, then interrupts any remaining runs after the timeout. # 0 = no drain, interrupt immediately. - "restart_drain_timeout": 60, + # + # 180s is calibrated for realistic in-flight agent turns: a typical + # coding conversation mid-reasoning runs 60–150s per call, so a 60s + # budget routinely interrupted legitimate work on /restart. Raise + # further in config.yaml if you run very-long-reasoning models. + "restart_drain_timeout": 180, # Max app-level retry attempts for API errors (connection drops, # provider timeouts, 5xx, etc.) before the agent surfaces the # failure. The OpenAI SDK already does its own low-level retries diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 3933ad8494..8f32e2cbd8 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1643,7 +1643,11 @@ def setup_terminal_backend(config: dict): def _apply_default_agent_settings(config: dict): """Apply recommended defaults for all agent settings without prompting.""" config.setdefault("agent", {})["max_turns"] = 90 - save_env_value("HERMES_MAX_ITERATIONS", "90") + # config.yaml is the authoritative source for max_turns; the gateway + # bridges it into HERMES_MAX_ITERATIONS at startup. We no longer write + # to .env to avoid the dual-source inconsistency that caused the + # 60-vs-500 bug (stale .env entry silently shadowing config.yaml). + remove_env_value("HERMES_MAX_ITERATIONS") config.setdefault("display", {})["tool_progress"] = "all" @@ -1673,9 +1677,10 @@ def setup_agent_settings(config: dict): print() # ── Max Iterations ── - current_max = get_env_value("HERMES_MAX_ITERATIONS") or str( - cfg_get(config, "agent", "max_turns", default=90) - ) + # config.yaml is authoritative; read from there. If a legacy .env + # entry is still around (from pre-PR#18413 setups), prefer the + # config value so we don't surface a stale number to the user. + current_max = str(cfg_get(config, "agent", "max_turns", default=90)) print_info("Maximum tool-calling iterations per conversation.") print_info("Higher = more complex tasks, but costs more tokens.") print_info( @@ -1686,9 +1691,13 @@ def setup_agent_settings(config: dict): try: max_iter = int(max_iter_str) if max_iter > 0: - save_env_value("HERMES_MAX_ITERATIONS", str(max_iter)) + # Write to config.yaml (authoritative) only. Also clean up any + # stale .env entry from earlier setup runs — the gateway's + # bridge in gateway/run.py now unconditionally derives + # HERMES_MAX_ITERATIONS from agent.max_turns at startup. config.setdefault("agent", {})["max_turns"] = max_iter config.pop("max_turns", None) + remove_env_value("HERMES_MAX_ITERATIONS") print_success(f"Max iterations set to {max_iter}") except ValueError: print_warning("Invalid number, keeping current value") diff --git a/tests/gateway/test_config_env_bridge_authority.py b/tests/gateway/test_config_env_bridge_authority.py new file mode 100644 index 0000000000..26c54f1c73 --- /dev/null +++ b/tests/gateway/test_config_env_bridge_authority.py @@ -0,0 +1,166 @@ +"""Regression tests for the config.yaml → env var bridge in gateway/run.py. + +Guards against the 60-vs-500 bug where a stale `.env HERMES_MAX_ITERATIONS=60` +entry silently shadowed `agent.max_turns: 500` in config.yaml because the +bridge used `if X not in os.environ` guards. After PR#18413 the bridge +treats config.yaml as authoritative and unconditionally overwrites .env +values for `agent.*`, `display.*`, `timezone`, and `security.*` keys. +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import textwrap +from pathlib import Path + +import pytest + + +PROJECT_ROOT = Path(__file__).resolve().parents[2] + + +def _run_gateway_import(hermes_home: Path, initial_env: dict[str, str]) -> dict[str, str]: + """Import gateway.run in a clean subprocess and return the post-import env. + + The bridge runs at module-import time, so simply importing is enough + to exercise it. Running in a subprocess isolates the test from other + import side effects and makes the "what ends up in os.environ" check + deterministic. + """ + script = textwrap.dedent( + f""" + import os, sys + sys.path.insert(0, {str(PROJECT_ROOT)!r}) + + try: + from gateway import run # noqa: F401 — module import triggers bridge + except Exception as exc: + print(f"IMPORT_ERROR:{{type(exc).__name__}}:{{exc}}", file=sys.stderr) + sys.exit(2) + + for k in ( + "HERMES_MAX_ITERATIONS", + "HERMES_AGENT_TIMEOUT", + "HERMES_AGENT_TIMEOUT_WARNING", + "HERMES_GATEWAY_BUSY_INPUT_MODE", + "HERMES_TIMEZONE", + ): + v = os.environ.get(k) + if v is not None: + print(f"{{k}}={{v}}") + """ + ) + env = dict(initial_env) + env["HERMES_HOME"] = str(hermes_home) + # Keep PATH / PYTHONPATH so venv imports resolve. + for k in ("PATH", "PYTHONPATH", "VIRTUAL_ENV", "HOME"): + if k in os.environ and k not in env: + env[k] = os.environ[k] + + result = subprocess.run( + [sys.executable, "-c", script], + env=env, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode != 0: + pytest.fail( + f"gateway.run import failed (rc={result.returncode})\n" + f"stderr:\n{result.stderr}\nstdout:\n{result.stdout}" + ) + out: dict[str, str] = {} + for line in result.stdout.splitlines(): + if "=" in line: + k, v = line.split("=", 1) + out[k] = v + return out + + +def _write_config(home: Path, agent_cfg: dict | None = None, display_cfg: dict | None = None, + timezone: str | None = None) -> None: + import yaml + cfg: dict = {} + if agent_cfg: + cfg["agent"] = agent_cfg + if display_cfg: + cfg["display"] = display_cfg + if timezone: + cfg["timezone"] = timezone + (home / "config.yaml").write_text(yaml.safe_dump(cfg)) + + +def _write_env(home: Path, entries: dict[str, str]) -> None: + lines = [f"{k}={v}\n" for k, v in entries.items()] + (home / ".env").write_text("".join(lines)) + + +@pytest.fixture +def hermes_home(tmp_path: Path) -> Path: + home = tmp_path / ".hermes" + home.mkdir() + return home + + +def test_config_max_turns_wins_over_stale_env(hermes_home: Path) -> None: + """Regression: config.yaml:agent.max_turns=500 must beat .env=60.""" + _write_config(hermes_home, agent_cfg={"max_turns": 500}) + _write_env(hermes_home, {"HERMES_MAX_ITERATIONS": "60"}) + + env = _run_gateway_import(hermes_home, initial_env={}) + + assert env.get("HERMES_MAX_ITERATIONS") == "500", ( + f"expected config.yaml max_turns=500 to win; got {env.get('HERMES_MAX_ITERATIONS')!r}. " + "Stale .env value is shadowing config — the bridge lost its override." + ) + + +def test_config_gateway_timeout_wins_over_stale_env(hermes_home: Path) -> None: + """Every agent.* bridge key must be config-authoritative, not .env-authoritative.""" + _write_config(hermes_home, agent_cfg={ + "gateway_timeout": 1800, + "gateway_timeout_warning": 900, + }) + _write_env(hermes_home, { + "HERMES_AGENT_TIMEOUT": "60", + "HERMES_AGENT_TIMEOUT_WARNING": "30", + }) + + env = _run_gateway_import(hermes_home, initial_env={}) + + assert env.get("HERMES_AGENT_TIMEOUT") == "1800" + assert env.get("HERMES_AGENT_TIMEOUT_WARNING") == "900" + + +def test_config_display_busy_input_mode_wins_over_stale_env(hermes_home: Path) -> None: + _write_config(hermes_home, display_cfg={"busy_input_mode": "interrupt"}) + _write_env(hermes_home, {"HERMES_GATEWAY_BUSY_INPUT_MODE": "queue"}) + + env = _run_gateway_import(hermes_home, initial_env={}) + + assert env.get("HERMES_GATEWAY_BUSY_INPUT_MODE") == "interrupt" + + +def test_config_timezone_wins_over_stale_env(hermes_home: Path) -> None: + _write_config(hermes_home, timezone="America/Los_Angeles") + _write_env(hermes_home, {"HERMES_TIMEZONE": "UTC"}) + + env = _run_gateway_import(hermes_home, initial_env={}) + + assert env.get("HERMES_TIMEZONE") == "America/Los_Angeles" + + +def test_env_value_survives_when_config_omits_key(hermes_home: Path) -> None: + """If config.yaml doesn't set max_turns, .env value must still pass through. + + The bridge only overwrites when the config key is present — an absent + config key should NOT clobber the .env value. + """ + _write_config(hermes_home, agent_cfg={}) # no max_turns + _write_env(hermes_home, {"HERMES_MAX_ITERATIONS": "123"}) + + env = _run_gateway_import(hermes_home, initial_env={}) + + assert env.get("HERMES_MAX_ITERATIONS") == "123" diff --git a/tests/gateway/test_restart_notification.py b/tests/gateway/test_restart_notification.py index 8297dfc32f..254917897f 100644 --- a/tests/gateway/test_restart_notification.py +++ b/tests/gateway/test_restart_notification.py @@ -242,4 +242,89 @@ async def test_send_restart_notification_cleans_up_on_send_failure( await runner._send_restart_notification() - assert not notify_path.exists() # cleaned up despite error + # File cleaned up even though send raised. + assert not notify_path.exists() + + +@pytest.mark.asyncio +async def test_send_restart_notification_logs_warning_on_sendresult_failure( + tmp_path, monkeypatch, caplog +): + """Adapter that returns SendResult(success=False) must log a WARNING, not INFO. + + Regression guard: adapter.send() catches provider errors (e.g. Telegram + "Chat not found") and returns SendResult(success=False) rather than + raising. The caller previously ignored the return value and always + logged "Sent restart notification to ..." at INFO — masking real + delivery failures behind a fake success line. + """ + from gateway.platforms.base import SendResult + + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + + notify_path = tmp_path / ".restart_notify.json" + notify_path.write_text(json.dumps({ + "platform": "telegram", + "chat_id": "42", + })) + + runner, adapter = make_restart_runner() + adapter.send = AsyncMock( + return_value=SendResult(success=False, error="Chat not found"), + ) + + with caplog.at_level("DEBUG", logger="gateway.run"): + await runner._send_restart_notification() + + success_lines = [ + r for r in caplog.records + if r.levelname == "INFO" and "Sent restart notification" in r.getMessage() + ] + warning_lines = [ + r for r in caplog.records + if r.levelname == "WARNING" + and "was not delivered" in r.getMessage() + and "Chat not found" in r.getMessage() + ] + assert not success_lines, ( + "Expected no INFO 'Sent restart notification' line when send failed, " + f"got: {[r.getMessage() for r in success_lines]}" + ) + assert warning_lines, ( + "Expected a WARNING line mentioning the failure; " + f"got records: {[(r.levelname, r.getMessage()) for r in caplog.records]}" + ) + # Still cleans up. + assert not notify_path.exists() + + +@pytest.mark.asyncio +async def test_send_restart_notification_logs_info_on_sendresult_success( + tmp_path, monkeypatch, caplog +): + """Adapter returning SendResult(success=True) keeps the INFO log line.""" + from gateway.platforms.base import SendResult + + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + + notify_path = tmp_path / ".restart_notify.json" + notify_path.write_text(json.dumps({ + "platform": "telegram", + "chat_id": "42", + })) + + runner, adapter = make_restart_runner() + adapter.send = AsyncMock(return_value=SendResult(success=True, message_id="m-1")) + + with caplog.at_level("DEBUG", logger="gateway.run"): + await runner._send_restart_notification() + + success_lines = [ + r for r in caplog.records + if r.levelname == "INFO" and "Sent restart notification" in r.getMessage() + ] + assert success_lines, ( + "Expected INFO 'Sent restart notification' when send succeeded; " + f"got records: {[(r.levelname, r.getMessage()) for r in caplog.records]}" + ) + assert not notify_path.exists() diff --git a/tests/gateway/test_whatsapp_connect.py b/tests/gateway/test_whatsapp_connect.py index 29f7eee3af..0a359fb751 100644 --- a/tests/gateway/test_whatsapp_connect.py +++ b/tests/gateway/test_whatsapp_connect.py @@ -284,6 +284,66 @@ class TestBridgeRuntimeFailure: mock_fh.close.assert_called_once() assert adapter._bridge_log_fh is None + @pytest.mark.asyncio + @pytest.mark.parametrize("returncode", [0, -2, -15]) + async def test_shutdown_suppresses_fatal_on_planned_bridge_exit(self, returncode): + """During graceful disconnect(), SIGTERM/SIGINT/clean-exit are NOT fatal. + + Regression guard for the bug where every gateway shutdown/restart + logged "Fatal whatsapp adapter error (whatsapp_bridge_exited)" and + dispatched a fatal-error notification just before the normal + "✓ whatsapp disconnected" — because _check_managed_bridge_exit() + saw the bridge's returncode of -15 (our own SIGTERM) and classified + it as an unexpected crash. + """ + adapter = _make_adapter() + fatal_handler = AsyncMock() + adapter.set_fatal_error_handler(fatal_handler) + adapter._running = True + adapter._http_session = MagicMock() + adapter._bridge_log_fh = MagicMock() + adapter._shutting_down = True # disconnect() sets this before SIGTERM + + mock_proc = MagicMock() + mock_proc.poll.return_value = returncode + adapter._bridge_process = mock_proc + + result = await adapter._check_managed_bridge_exit() + + assert result is None, ( + f"returncode={returncode} during shutdown should be suppressed, " + f"got fatal message: {result!r}" + ) + assert adapter.fatal_error_code is None + fatal_handler.assert_not_awaited() + + @pytest.mark.asyncio + async def test_shutdown_still_surfaces_nonzero_crash(self): + """Even during shutdown, a truly crashed bridge (e.g. returncode 9) is fatal. + + The suppression list is deliberately narrow (0, -2, -15) so that + OOM-kill (137), assertion failures, or custom error exits still + reach the fatal-error handler and user notification path. + """ + adapter = _make_adapter() + fatal_handler = AsyncMock() + adapter.set_fatal_error_handler(fatal_handler) + adapter._running = True + adapter._http_session = MagicMock() + adapter._bridge_log_fh = MagicMock() + adapter._shutting_down = True + + mock_proc = MagicMock() + mock_proc.poll.return_value = 137 # SIGKILL / OOM-kill + adapter._bridge_process = mock_proc + + result = await adapter._check_managed_bridge_exit() + + assert result is not None + assert "exited unexpectedly" in result + assert adapter.fatal_error_code == "whatsapp_bridge_exited" + fatal_handler.assert_awaited_once() + @pytest.mark.asyncio async def test_closed_when_http_not_ready(self): """Health endpoint never returns 200 within 15 attempts.""" diff --git a/tests/hermes_cli/test_setup_agent_settings.py b/tests/hermes_cli/test_setup_agent_settings.py index 868be7508c..b0e1d906ab 100644 --- a/tests/hermes_cli/test_setup_agent_settings.py +++ b/tests/hermes_cli/test_setup_agent_settings.py @@ -4,11 +4,16 @@ from hermes_cli.setup import setup_agent_settings def test_setup_agent_settings_uses_displayed_max_iterations_value(tmp_path, monkeypatch, capsys): - """The helper text should match the value shown in the prompt.""" + """The helper text should match the value shown in the prompt. + + After PR#18413 max_turns is read exclusively from config.yaml — the + .env `HERMES_MAX_ITERATIONS` fallback was removed because it was + shadowing the user's current config (see the 60-vs-500 incident). + """ monkeypatch.setenv("HERMES_HOME", str(tmp_path)) config = { - "agent": {"max_turns": 90}, + "agent": {"max_turns": 60}, "display": {"tool_progress": "all"}, "compression": {"threshold": 0.50}, "session_reset": {"mode": "both", "idle_minutes": 1440, "at_hour": 4}, @@ -16,10 +21,10 @@ def test_setup_agent_settings_uses_displayed_max_iterations_value(tmp_path, monk prompt_answers = iter(["60", "all", "0.5"]) - monkeypatch.setattr("hermes_cli.setup.get_env_value", lambda key: "60" if key == "HERMES_MAX_ITERATIONS" else "") monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: next(prompt_answers)) monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: 4) monkeypatch.setattr("hermes_cli.setup.save_env_value", lambda *args, **kwargs: None) + monkeypatch.setattr("hermes_cli.setup.remove_env_value", lambda *args, **kwargs: None) monkeypatch.setattr("hermes_cli.setup.save_config", lambda *args, **kwargs: None) setup_agent_settings(config) @@ -27,3 +32,47 @@ def test_setup_agent_settings_uses_displayed_max_iterations_value(tmp_path, monk out = capsys.readouterr().out assert "Press Enter to keep 60." in out assert "Default is 90" not in out + + +def test_setup_agent_settings_prefers_config_over_stale_env(tmp_path, monkeypatch, capsys): + """Config.yaml wins even when a stale .env value disagrees. + + Regression guard for the bug where `.env HERMES_MAX_ITERATIONS=60` + from an old `hermes setup` run shadowed `agent.max_turns: 500` in + config.yaml. The wizard must now display the config value. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + config = { + "agent": {"max_turns": 500}, # user bumped this in config.yaml + "display": {"tool_progress": "all"}, + "compression": {"threshold": 0.50}, + "session_reset": {"mode": "both", "idle_minutes": 1440, "at_hour": 4}, + } + + prompt_answers = iter(["500", "all", "0.5"]) + + # Simulate stale .env value — the wizard must ignore this. + monkeypatch.setattr( + "hermes_cli.setup.get_env_value", + lambda key: "60" if key == "HERMES_MAX_ITERATIONS" else "", + ) + monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: next(prompt_answers)) + monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: 4) + monkeypatch.setattr("hermes_cli.setup.save_env_value", lambda *args, **kwargs: None) + + removed_keys: list[str] = [] + monkeypatch.setattr( + "hermes_cli.setup.remove_env_value", + lambda key: (removed_keys.append(key), True)[1], + ) + monkeypatch.setattr("hermes_cli.setup.save_config", lambda *args, **kwargs: None) + + setup_agent_settings(config) + + out = capsys.readouterr().out + # Config value wins + assert "Press Enter to keep 500." in out + assert "Press Enter to keep 60." not in out + # And the stale .env entry gets cleaned up + assert "HERMES_MAX_ITERATIONS" in removed_keys