From 73b92264ee08cc25dfee3b8854ce0c94f6534a5b Mon Sep 17 00:00:00 2001 From: konsisumer Date: Sun, 21 Jun 2026 12:21:33 +0530 Subject: [PATCH] fix(cron): resolve model.default + fail fast on missing model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cron jobs created without an explicit `model` are stored as `model: null`. At fire time `run_job` resolved `model = job.get("model") or os.getenv( "HERMES_MODEL") or ""` and then `_model_cfg.get("default", model)`, so when config.yaml had no `model.default` (or `model: {default: null}`) an empty string flowed straight to the provider and surfaced as an opaque HTTP 400 ("Model parameter is required" / "model: String should have at least 1 character"). The operator had to inspect jobs.json to discover the job was stored with a null model. This change makes cron model resolution robust and symmetric with the CLI: - Coerce `model: null`/missing config to `{}` so a falsy default never overwrites an already-resolved env value with `None`. - Only overwrite `model` from `model.default` when the resolved value is truthy; accept a `model.model` alias key, mirroring the sibling resolvers in hermes_cli/oneshot.py, fallback_cmd.py and prompt_size.py. - Resolve AFTER the managed-scope overlay so an administrator-pinned model still wins. - Fail fast with an actionable error (caught by run_job's outer handler and recorded as the job's last_error — the cron ticker is unaffected) instead of letting an empty model reach the API. - The per-job model is re-read every tick, so a `cronjob action=update model=...` after a failed run takes effect on the next tick (no cache). Adds tests/cron/conftest.py pinning a default HERMES_MODEL so existing run_job tests don't trip the new guard, plus regression tests covering env fallback, config.default fallback, string-form config, the model alias key, null-default-no-clobber, corrupt-config graceful degradation, fail-fast, and the no-cache re-read property. Salvaged from #24005, rebased onto current main, with additional test coverage folded in from #45550 and the alias-key behavior from #43952. Fixes #43899 Fixes #23979 Fixes #22761 Co-authored-by: szzhoujiarui-sketch Co-authored-by: rayjun --- cron/scheduler.py | 28 ++++- tests/cron/conftest.py | 21 ++++ tests/cron/test_scheduler.py | 233 +++++++++++++++++++++++++++++++++++ 3 files changed, 280 insertions(+), 2 deletions(-) create mode 100644 tests/cron/conftest.py diff --git a/cron/scheduler.py b/cron/scheduler.py index bd8ac6fdd8e..0956528b132 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -1652,6 +1652,11 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: else str(delivery_target["thread_id"]) ) + # Model resolution precedence: per-job override > HERMES_MODEL env > + # config.yaml ``model:`` (string or ``{default: ...}``). The per-job + # value is intentionally re-read from storage every tick so a + # ``cronjob action=update model=...`` after a failed run takes effect + # on the next tick — there is no in-memory cache. model = job.get("model") or os.getenv("HERMES_MODEL") or "" # Load config.yaml for model, reasoning, prefill, toolsets, provider routing @@ -1672,15 +1677,34 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: except Exception: pass _cfg = _expand_env_vars(_cfg) - _model_cfg = _cfg.get("model", {}) + # Coerce null/missing to {} so a falsy default never + # clobbers an already-resolved env value with ``None``. + _model_cfg = _cfg.get("model") or {} if not job.get("model"): if isinstance(_model_cfg, str): model = _model_cfg elif isinstance(_model_cfg, dict): - model = _model_cfg.get("default", model) + # Mirror the CLI/oneshot resolution: prefer ``default``, + # accept a ``model`` alias, overwrite only when truthy. + _default = _model_cfg.get("default") or _model_cfg.get("model") + if _default: + model = _default except Exception as e: logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e) + # Fail fast if no model resolved from job / env / config.yaml: an empty + # model otherwise reaches the provider as an opaque 400 (#23979). + if not (isinstance(model, str) and model.strip()): + raise RuntimeError( + f"Cron job '{job_name}' has no model configured " + f"(job.model={job.get('model')!r}, " + f"HERMES_MODEL={os.getenv('HERMES_MODEL', '')!r}, " + "config.yaml model.default missing or empty). " + f"Set a per-job model via " + f"`cronjob action=update job_id={job_id} model=` or set a " + "default with `hermes model `." + ) + # Apply IPv4 preference if configured. try: from hermes_constants import apply_ipv4_preference diff --git a/tests/cron/conftest.py b/tests/cron/conftest.py new file mode 100644 index 00000000000..caaec455948 --- /dev/null +++ b/tests/cron/conftest.py @@ -0,0 +1,21 @@ +"""Cron-test fixtures. + +Provides a default ``HERMES_MODEL`` for cron run_job tests so each one +doesn't have to spell out a model. The global conftest blanks +HERMES_MODEL hermetically; without this autouse fixture every cron test +that exercises ``run_job`` would hit the fail-fast guard added in +``cron/scheduler.py`` (see issue #23979) and have to be rewritten. + +Tests that specifically need ``HERMES_MODEL`` unset — model-resolution +edge cases — call ``monkeypatch.delenv("HERMES_MODEL", raising=False)`` +inside the test, which overrides this fixture's value for that scope. +""" + +import pytest + + +@pytest.fixture(autouse=True) +def _default_cron_test_model(monkeypatch): + """Pin a default HERMES_MODEL so cron run_job tests have a resolvable model.""" + monkeypatch.setenv("HERMES_MODEL", "test-cron-default-model") + yield diff --git a/tests/cron/test_scheduler.py b/tests/cron/test_scheduler.py index fd445de8ca6..a13e943ad3c 100644 --- a/tests/cron/test_scheduler.py +++ b/tests/cron/test_scheduler.py @@ -1616,6 +1616,7 @@ class TestRunJobConfigEnvVarExpansion: def test_fallback_model_env_ref_in_config_yaml_is_expanded(self, tmp_path, monkeypatch): """${VAR} in config.yaml fallback_providers model: is expanded.""" (tmp_path / "config.yaml").write_text( + "model: primary-model\n" "fallback_providers:\n" " - provider: openrouter\n" " model: ${_HERMES_TEST_CRON_FALLBACK}\n" @@ -1672,6 +1673,238 @@ class TestRunJobConfigEnvVarExpansion: assert kwargs["model"] == "${_HERMES_TEST_CRON_UNSET_VAR}" +class TestRunJobModelResolution: + """Verify defensive model resolution for jobs stored with ``model: null``. + + Issue #23979: a cron job created without an explicit model is stored as + ``model: null``. At fire time the scheduler must: + 1. fall back to ``HERMES_MODEL`` env if set, + 2. else fall back to config.yaml ``model.default`` if set, + 3. else fail fast with an actionable error — never let an empty string + reach the provider where it surfaces as an opaque 400. + """ + + _RUNTIME = { + "api_key": "test-key", + "base_url": "https://example.invalid/v1", + "provider": "openrouter", + "api_mode": "chat_completions", + } + + def test_null_job_model_falls_back_to_env(self, tmp_path, monkeypatch): + """``model: null`` on the job uses HERMES_MODEL when set.""" + (tmp_path / "config.yaml").write_text("") + monkeypatch.setenv("HERMES_MODEL", "env-model") + + job = {"id": "null-model-job", "name": "null model", "prompt": "hi", "model": None} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + assert success is True + assert error is None + assert mock_agent_cls.call_args.kwargs["model"] == "env-model" + + def test_null_job_model_falls_back_to_config_default(self, tmp_path, monkeypatch): + """``model: null`` on the job uses config.yaml model.default when env is empty.""" + (tmp_path / "config.yaml").write_text("model:\n default: config-default-model\n") + monkeypatch.delenv("HERMES_MODEL", raising=False) + + job = {"id": "cfg-default-job", "name": "cfg default", "prompt": "hi", "model": None} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + assert success is True + assert error is None + assert mock_agent_cls.call_args.kwargs["model"] == "config-default-model" + + def test_explicit_null_model_block_in_config_does_not_overwrite_env(self, tmp_path, monkeypatch): + """``model: null`` in config.yaml must not overwrite a resolved HERMES_MODEL. + + Regression: before #23979 the resolver coerced ``model: null`` to + ``{}`` only via the ``.get("model", {})`` default — which does not + fire when the key is present with a None value. The resolver then + skipped both branches and kept the env value, but a similar + ``model: {default: null}`` shape would call ``.get("default", model)`` + which returns ``None`` and clobbered ``model``. + """ + (tmp_path / "config.yaml").write_text("model:\n default: null\n") + monkeypatch.setenv("HERMES_MODEL", "env-model") + + job = {"id": "null-default-job", "name": "null default", "prompt": "hi", "model": None} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + assert success is True + assert mock_agent_cls.call_args.kwargs["model"] == "env-model" + + def test_no_model_anywhere_fails_with_actionable_error(self, tmp_path, monkeypatch): + """All three sources empty → fail fast with a clear message, not an opaque 400.""" + (tmp_path / "config.yaml").write_text("") + monkeypatch.delenv("HERMES_MODEL", raising=False) + + job = {"id": "no-model-job", "name": "no model anywhere", "prompt": "hi", "model": None} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + success, _, _, error = run_job(job) + + assert success is False + assert error is not None + assert "no model configured" in error + # AIAgent must never be constructed with an empty model — that's + # precisely the bug we're guarding against. + mock_agent_cls.assert_not_called() + + def test_job_model_update_takes_effect_on_next_run(self, tmp_path, monkeypatch): + """The per-job model is re-read every tick — no in-memory cache. + + This is the property the original bug report asked for. We verify + it by calling run_job twice with the same job dict mutated between + calls, simulating the storage update flow. + """ + (tmp_path / "config.yaml").write_text("") + monkeypatch.delenv("HERMES_MODEL", raising=False) + + job = {"id": "updated-model-job", "name": "updated", "prompt": "hi", "model": "first-model"} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + + run_job(job) + assert mock_agent_cls.call_args.kwargs["model"] == "first-model" + + job["model"] = "second-model" # simulates jobs.json being rewritten + run_job(job) + assert mock_agent_cls.call_args.kwargs["model"] == "second-model" + + def test_config_model_as_plain_string(self, tmp_path, monkeypatch): + """config.yaml ``model:`` given as a bare string is used directly.""" + (tmp_path / "config.yaml").write_text("model: string-form-model\n") + monkeypatch.delenv("HERMES_MODEL", raising=False) + + job = {"id": "string-cfg-job", "name": "string cfg", "prompt": "hi", "model": None} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + assert success is True + assert error is None + assert mock_agent_cls.call_args.kwargs["model"] == "string-form-model" + + def test_config_model_alias_key_resolves(self, tmp_path, monkeypatch): + """A ``model: {model: ...}`` alias key resolves like the CLI sibling. + + ``hermes_cli/oneshot.py``, ``fallback_cmd.py`` and ``prompt_size.py`` + all accept ``model.model`` as an alias for ``model.default``. The cron + resolver mirrors that so a config that works in the CLI also works in + cron. + """ + (tmp_path / "config.yaml").write_text("model:\n model: alias-key-model\n") + monkeypatch.delenv("HERMES_MODEL", raising=False) + + job = {"id": "alias-job", "name": "alias", "prompt": "hi", "model": None} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + assert success is True + assert error is None + assert mock_agent_cls.call_args.kwargs["model"] == "alias-key-model" + + def test_corrupt_config_yaml_does_not_crash_with_job_model(self, tmp_path, monkeypatch): + """A malformed config.yaml degrades gracefully when the job has a model.""" + (tmp_path / "config.yaml").write_text("{{{invalid yaml!!!") + monkeypatch.delenv("HERMES_MODEL", raising=False) + + job = {"id": "corrupt-job", "name": "corrupt", "prompt": "hi", "model": "explicit-model"} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + # Explicit job model survives the corrupt-config fall-through. + assert success is True + assert error is None + assert mock_agent_cls.call_args.kwargs["model"] == "explicit-model" + + class TestRunJobSkillBacked: def test_run_job_preserves_skill_env_passthrough_into_worker_thread(self, tmp_path): job = {