fix(cron): resolve model.default + fail fast on missing model

Cron jobs created without an explicit `model` are stored as `model: null`. At fire time `run_job` resolved `model = job.get("model") or os.getenv( "HERMES_MODEL") or ""` and then `_model_cfg.get("default", model)`, so when config.yaml had no `model.default` (or `model: {default: null}`) an empty string flowed straight to the provider and surfaced as an opaque HTTP 400 ("Model parameter is required" / "model: String should have at least 1 character"). The operator had to inspect jobs.json to discover the job was stored with a null model. This change makes cron model resolution robust and symmetric with the CLI: - Coerce `model: null`/missing config to `{}` so a falsy default never overwrites an already-resolved env value with `None`. - Only overwrite `model` from `model.default` when the resolved value is truthy; accept a `model.model` alias key, mirroring the sibling resolvers in hermes_cli/oneshot.py, fallback_cmd.py and prompt_size.py. - Resolve AFTER the managed-scope overlay so an administrator-pinned model still wins. - Fail fast with an actionable error (caught by run_job's outer handler and recorded as the job's last_error — the cron ticker is unaffected) instead of letting an empty model reach the API. - The per-job model is re-read every tick, so a `cronjob action=update model=...` after a failed run takes effect on the next tick (no cache). Adds tests/cron/conftest.py pinning a default HERMES_MODEL so existing run_job tests don't trip the new guard, plus regression tests covering env fallback, config.default fallback, string-form config, the model alias key, null-default-no-clobber, corrupt-config graceful degradation, fail-fast, and the no-cache re-read property. Salvaged from #24005, rebased onto current main, with additional test coverage folded in from #45550 and the alias-key behavior from #43952. Fixes #43899 Fixes #23979 Fixes #22761 Co-authored-by: szzhoujiarui-sketch <szzhoujiarui@gmail.com> Co-authored-by: rayjun <rayjun0412@gmail.com>
2026-06-23 10:42:00 +00:00 · 2026-06-21 12:21:33 +05:30 · 2026-06-21 12:21:33 +05:30 · 73b92264ee
commit 73b92264ee
parent 14ef6312b5
3 changed files with 280 additions and 2 deletions
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@ -1652,6 +1652,11 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
                else str(delivery_target["thread_id"])
            )

+        # Model resolution precedence: per-job override > HERMES_MODEL env >
+        # config.yaml ``model:`` (string or ``{default: ...}``). The per-job
+        # value is intentionally re-read from storage every tick so a
+        # ``cronjob action=update model=...`` after a failed run takes effect
+        # on the next tick — there is no in-memory cache.
        model = job.get("model") or os.getenv("HERMES_MODEL") or ""

        # Load config.yaml for model, reasoning, prefill, toolsets, provider routing
@ -1672,15 +1677,34 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
                except Exception:
                    pass
                _cfg = _expand_env_vars(_cfg)
-                _model_cfg = _cfg.get("model", {})
+                # Coerce null/missing to {} so a falsy default never
+                # clobbers an already-resolved env value with ``None``.
+                _model_cfg = _cfg.get("model") or {}
                if not job.get("model"):
                    if isinstance(_model_cfg, str):
                        model = _model_cfg
                    elif isinstance(_model_cfg, dict):
-                        model = _model_cfg.get("default", model)
+                        # Mirror the CLI/oneshot resolution: prefer ``default``,
+                        # accept a ``model`` alias, overwrite only when truthy.
+                        _default = _model_cfg.get("default") or _model_cfg.get("model")
+                        if _default:
+                            model = _default
        except Exception as e:
            logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)

+        # Fail fast if no model resolved from job / env / config.yaml: an empty
+        # model otherwise reaches the provider as an opaque 400 (#23979).
+        if not (isinstance(model, str) and model.strip()):
+            raise RuntimeError(
+                f"Cron job '{job_name}' has no model configured "
+                f"(job.model={job.get('model')!r}, "
+                f"HERMES_MODEL={os.getenv('HERMES_MODEL', '')!r}, "
+                "config.yaml model.default missing or empty). "
+                f"Set a per-job model via "
+                f"`cronjob action=update job_id={job_id} model=<name>` or set a "
+                "default with `hermes model <name>`."
+            )
+
        # Apply IPv4 preference if configured.
        try:
            from hermes_constants import apply_ipv4_preference
--- a/tests/cron/conftest.py
+++ b/tests/cron/conftest.py
@ -0,0 +1,21 @@
+"""Cron-test fixtures.
+
+Provides a default ``HERMES_MODEL`` for cron run_job tests so each one
+doesn't have to spell out a model. The global conftest blanks
+HERMES_MODEL hermetically; without this autouse fixture every cron test
+that exercises ``run_job`` would hit the fail-fast guard added in
+``cron/scheduler.py`` (see issue #23979) and have to be rewritten.
+
+Tests that specifically need ``HERMES_MODEL`` unset — model-resolution
+edge cases — call ``monkeypatch.delenv("HERMES_MODEL", raising=False)``
+inside the test, which overrides this fixture's value for that scope.
+"""
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _default_cron_test_model(monkeypatch):
+    """Pin a default HERMES_MODEL so cron run_job tests have a resolvable model."""
+    monkeypatch.setenv("HERMES_MODEL", "test-cron-default-model")
+    yield
--- a/tests/cron/test_scheduler.py
+++ b/tests/cron/test_scheduler.py
@ -1616,6 +1616,7 @@ class TestRunJobConfigEnvVarExpansion:
    def test_fallback_model_env_ref_in_config_yaml_is_expanded(self, tmp_path, monkeypatch):
        """${VAR} in config.yaml fallback_providers model: is expanded."""
        (tmp_path / "config.yaml").write_text(
+            "model: primary-model\n"
            "fallback_providers:\n"
            "  - provider: openrouter\n"
            "    model: ${_HERMES_TEST_CRON_FALLBACK}\n"
@ -1672,6 +1673,238 @@ class TestRunJobConfigEnvVarExpansion:
        assert kwargs["model"] == "${_HERMES_TEST_CRON_UNSET_VAR}"


+class TestRunJobModelResolution:
+    """Verify defensive model resolution for jobs stored with ``model: null``.
+
+    Issue #23979: a cron job created without an explicit model is stored as
+    ``model: null``. At fire time the scheduler must:
+      1. fall back to ``HERMES_MODEL`` env if set,
+      2. else fall back to config.yaml ``model.default`` if set,
+      3. else fail fast with an actionable error — never let an empty string
+         reach the provider where it surfaces as an opaque 400.
+    """
+
+    _RUNTIME = {
+        "api_key": "test-key",
+        "base_url": "https://example.invalid/v1",
+        "provider": "openrouter",
+        "api_mode": "chat_completions",
+    }
+
+    def test_null_job_model_falls_back_to_env(self, tmp_path, monkeypatch):
+        """``model: null`` on the job uses HERMES_MODEL when set."""
+        (tmp_path / "config.yaml").write_text("")
+        monkeypatch.setenv("HERMES_MODEL", "env-model")
+
+        job = {"id": "null-model-job", "name": "null model", "prompt": "hi", "model": None}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        assert success is True
+        assert error is None
+        assert mock_agent_cls.call_args.kwargs["model"] == "env-model"
+
+    def test_null_job_model_falls_back_to_config_default(self, tmp_path, monkeypatch):
+        """``model: null`` on the job uses config.yaml model.default when env is empty."""
+        (tmp_path / "config.yaml").write_text("model:\n  default: config-default-model\n")
+        monkeypatch.delenv("HERMES_MODEL", raising=False)
+
+        job = {"id": "cfg-default-job", "name": "cfg default", "prompt": "hi", "model": None}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        assert success is True
+        assert error is None
+        assert mock_agent_cls.call_args.kwargs["model"] == "config-default-model"
+
+    def test_explicit_null_model_block_in_config_does_not_overwrite_env(self, tmp_path, monkeypatch):
+        """``model: null`` in config.yaml must not overwrite a resolved HERMES_MODEL.
+
+        Regression: before #23979 the resolver coerced ``model: null`` to
+        ``{}`` only via the ``.get("model", {})`` default — which does not
+        fire when the key is present with a None value. The resolver then
+        skipped both branches and kept the env value, but a similar
+        ``model: {default: null}`` shape would call ``.get("default", model)``
+        which returns ``None`` and clobbered ``model``.
+        """
+        (tmp_path / "config.yaml").write_text("model:\n  default: null\n")
+        monkeypatch.setenv("HERMES_MODEL", "env-model")
+
+        job = {"id": "null-default-job", "name": "null default", "prompt": "hi", "model": None}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        assert success is True
+        assert mock_agent_cls.call_args.kwargs["model"] == "env-model"
+
+    def test_no_model_anywhere_fails_with_actionable_error(self, tmp_path, monkeypatch):
+        """All three sources empty → fail fast with a clear message, not an opaque 400."""
+        (tmp_path / "config.yaml").write_text("")
+        monkeypatch.delenv("HERMES_MODEL", raising=False)
+
+        job = {"id": "no-model-job", "name": "no model anywhere", "prompt": "hi", "model": None}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            success, _, _, error = run_job(job)
+
+        assert success is False
+        assert error is not None
+        assert "no model configured" in error
+        # AIAgent must never be constructed with an empty model — that's
+        # precisely the bug we're guarding against.
+        mock_agent_cls.assert_not_called()
+
+    def test_job_model_update_takes_effect_on_next_run(self, tmp_path, monkeypatch):
+        """The per-job model is re-read every tick — no in-memory cache.
+
+        This is the property the original bug report asked for. We verify
+        it by calling run_job twice with the same job dict mutated between
+        calls, simulating the storage update flow.
+        """
+        (tmp_path / "config.yaml").write_text("")
+        monkeypatch.delenv("HERMES_MODEL", raising=False)
+
+        job = {"id": "updated-model-job", "name": "updated", "prompt": "hi", "model": "first-model"}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+
+            run_job(job)
+            assert mock_agent_cls.call_args.kwargs["model"] == "first-model"
+
+            job["model"] = "second-model"  # simulates jobs.json being rewritten
+            run_job(job)
+            assert mock_agent_cls.call_args.kwargs["model"] == "second-model"
+
+    def test_config_model_as_plain_string(self, tmp_path, monkeypatch):
+        """config.yaml ``model:`` given as a bare string is used directly."""
+        (tmp_path / "config.yaml").write_text("model: string-form-model\n")
+        monkeypatch.delenv("HERMES_MODEL", raising=False)
+
+        job = {"id": "string-cfg-job", "name": "string cfg", "prompt": "hi", "model": None}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        assert success is True
+        assert error is None
+        assert mock_agent_cls.call_args.kwargs["model"] == "string-form-model"
+
+    def test_config_model_alias_key_resolves(self, tmp_path, monkeypatch):
+        """A ``model: {model: ...}`` alias key resolves like the CLI sibling.
+
+        ``hermes_cli/oneshot.py``, ``fallback_cmd.py`` and ``prompt_size.py``
+        all accept ``model.model`` as an alias for ``model.default``. The cron
+        resolver mirrors that so a config that works in the CLI also works in
+        cron.
+        """
+        (tmp_path / "config.yaml").write_text("model:\n  model: alias-key-model\n")
+        monkeypatch.delenv("HERMES_MODEL", raising=False)
+
+        job = {"id": "alias-job", "name": "alias", "prompt": "hi", "model": None}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        assert success is True
+        assert error is None
+        assert mock_agent_cls.call_args.kwargs["model"] == "alias-key-model"
+
+    def test_corrupt_config_yaml_does_not_crash_with_job_model(self, tmp_path, monkeypatch):
+        """A malformed config.yaml degrades gracefully when the job has a model."""
+        (tmp_path / "config.yaml").write_text("{{{invalid yaml!!!")
+        monkeypatch.delenv("HERMES_MODEL", raising=False)
+
+        job = {"id": "corrupt-job", "name": "corrupt", "prompt": "hi", "model": "explicit-model"}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        # Explicit job model survives the corrupt-config fall-through.
+        assert success is True
+        assert error is None
+        assert mock_agent_cls.call_args.kwargs["model"] == "explicit-model"
+
+
 class TestRunJobSkillBacked:
    def test_run_job_preserves_skill_env_passthrough_into_worker_thread(self, tmp_path):
        job = {