fix(cron): resolve model.default + fail fast on missing model

Cron jobs created without an explicit `model` are stored as `model: null`.
At fire time `run_job` resolved `model = job.get("model") or os.getenv(
"HERMES_MODEL") or ""` and then `_model_cfg.get("default", model)`, so when
config.yaml had no `model.default` (or `model: {default: null}`) an empty
string flowed straight to the provider and surfaced as an opaque HTTP 400
("Model parameter is required" / "model: String should have at least 1
character"). The operator had to inspect jobs.json to discover the job was
stored with a null model.

This change makes cron model resolution robust and symmetric with the CLI:

- Coerce `model: null`/missing config to `{}` so a falsy default never
  overwrites an already-resolved env value with `None`.
- Only overwrite `model` from `model.default` when the resolved value is
  truthy; accept a `model.model` alias key, mirroring the sibling resolvers
  in hermes_cli/oneshot.py, fallback_cmd.py and prompt_size.py.
- Resolve AFTER the managed-scope overlay so an administrator-pinned model
  still wins.
- Fail fast with an actionable error (caught by run_job's outer handler and
  recorded as the job's last_error — the cron ticker is unaffected) instead
  of letting an empty model reach the API.
- The per-job model is re-read every tick, so a `cronjob action=update
  model=...` after a failed run takes effect on the next tick (no cache).

Adds tests/cron/conftest.py pinning a default HERMES_MODEL so existing
run_job tests don't trip the new guard, plus regression tests covering env
fallback, config.default fallback, string-form config, the model alias key,
null-default-no-clobber, corrupt-config graceful degradation, fail-fast,
and the no-cache re-read property.

Salvaged from #24005, rebased onto current main, with additional test
coverage folded in from #45550 and the alias-key behavior from #43952.

Fixes #43899
Fixes #23979
Fixes #22761

Co-authored-by: szzhoujiarui-sketch <szzhoujiarui@gmail.com>
Co-authored-by: rayjun <rayjun0412@gmail.com>
This commit is contained in:
konsisumer 2026-06-21 12:21:33 +05:30 committed by kshitij
parent 14ef6312b5
commit 73b92264ee
3 changed files with 280 additions and 2 deletions

View file

@ -1652,6 +1652,11 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
else str(delivery_target["thread_id"])
)
# Model resolution precedence: per-job override > HERMES_MODEL env >
# config.yaml ``model:`` (string or ``{default: ...}``). The per-job
# value is intentionally re-read from storage every tick so a
# ``cronjob action=update model=...`` after a failed run takes effect
# on the next tick — there is no in-memory cache.
model = job.get("model") or os.getenv("HERMES_MODEL") or ""
# Load config.yaml for model, reasoning, prefill, toolsets, provider routing
@ -1672,15 +1677,34 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
except Exception:
pass
_cfg = _expand_env_vars(_cfg)
_model_cfg = _cfg.get("model", {})
# Coerce null/missing to {} so a falsy default never
# clobbers an already-resolved env value with ``None``.
_model_cfg = _cfg.get("model") or {}
if not job.get("model"):
if isinstance(_model_cfg, str):
model = _model_cfg
elif isinstance(_model_cfg, dict):
model = _model_cfg.get("default", model)
# Mirror the CLI/oneshot resolution: prefer ``default``,
# accept a ``model`` alias, overwrite only when truthy.
_default = _model_cfg.get("default") or _model_cfg.get("model")
if _default:
model = _default
except Exception as e:
logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)
# Fail fast if no model resolved from job / env / config.yaml: an empty
# model otherwise reaches the provider as an opaque 400 (#23979).
if not (isinstance(model, str) and model.strip()):
raise RuntimeError(
f"Cron job '{job_name}' has no model configured "
f"(job.model={job.get('model')!r}, "
f"HERMES_MODEL={os.getenv('HERMES_MODEL', '')!r}, "
"config.yaml model.default missing or empty). "
f"Set a per-job model via "
f"`cronjob action=update job_id={job_id} model=<name>` or set a "
"default with `hermes model <name>`."
)
# Apply IPv4 preference if configured.
try:
from hermes_constants import apply_ipv4_preference

21
tests/cron/conftest.py Normal file
View file

@ -0,0 +1,21 @@
"""Cron-test fixtures.
Provides a default ``HERMES_MODEL`` for cron run_job tests so each one
doesn't have to spell out a model. The global conftest blanks
HERMES_MODEL hermetically; without this autouse fixture every cron test
that exercises ``run_job`` would hit the fail-fast guard added in
``cron/scheduler.py`` (see issue #23979) and have to be rewritten.
Tests that specifically need ``HERMES_MODEL`` unset model-resolution
edge cases call ``monkeypatch.delenv("HERMES_MODEL", raising=False)``
inside the test, which overrides this fixture's value for that scope.
"""
import pytest
@pytest.fixture(autouse=True)
def _default_cron_test_model(monkeypatch):
"""Pin a default HERMES_MODEL so cron run_job tests have a resolvable model."""
monkeypatch.setenv("HERMES_MODEL", "test-cron-default-model")
yield

View file

@ -1616,6 +1616,7 @@ class TestRunJobConfigEnvVarExpansion:
def test_fallback_model_env_ref_in_config_yaml_is_expanded(self, tmp_path, monkeypatch):
"""${VAR} in config.yaml fallback_providers model: is expanded."""
(tmp_path / "config.yaml").write_text(
"model: primary-model\n"
"fallback_providers:\n"
" - provider: openrouter\n"
" model: ${_HERMES_TEST_CRON_FALLBACK}\n"
@ -1672,6 +1673,238 @@ class TestRunJobConfigEnvVarExpansion:
assert kwargs["model"] == "${_HERMES_TEST_CRON_UNSET_VAR}"
class TestRunJobModelResolution:
"""Verify defensive model resolution for jobs stored with ``model: null``.
Issue #23979: a cron job created without an explicit model is stored as
``model: null``. At fire time the scheduler must:
1. fall back to ``HERMES_MODEL`` env if set,
2. else fall back to config.yaml ``model.default`` if set,
3. else fail fast with an actionable error never let an empty string
reach the provider where it surfaces as an opaque 400.
"""
_RUNTIME = {
"api_key": "test-key",
"base_url": "https://example.invalid/v1",
"provider": "openrouter",
"api_mode": "chat_completions",
}
def test_null_job_model_falls_back_to_env(self, tmp_path, monkeypatch):
"""``model: null`` on the job uses HERMES_MODEL when set."""
(tmp_path / "config.yaml").write_text("")
monkeypatch.setenv("HERMES_MODEL", "env-model")
job = {"id": "null-model-job", "name": "null model", "prompt": "hi", "model": None}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
success, _, _, error = run_job(job)
assert success is True
assert error is None
assert mock_agent_cls.call_args.kwargs["model"] == "env-model"
def test_null_job_model_falls_back_to_config_default(self, tmp_path, monkeypatch):
"""``model: null`` on the job uses config.yaml model.default when env is empty."""
(tmp_path / "config.yaml").write_text("model:\n default: config-default-model\n")
monkeypatch.delenv("HERMES_MODEL", raising=False)
job = {"id": "cfg-default-job", "name": "cfg default", "prompt": "hi", "model": None}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
success, _, _, error = run_job(job)
assert success is True
assert error is None
assert mock_agent_cls.call_args.kwargs["model"] == "config-default-model"
def test_explicit_null_model_block_in_config_does_not_overwrite_env(self, tmp_path, monkeypatch):
"""``model: null`` in config.yaml must not overwrite a resolved HERMES_MODEL.
Regression: before #23979 the resolver coerced ``model: null`` to
``{}`` only via the ``.get("model", {})`` default which does not
fire when the key is present with a None value. The resolver then
skipped both branches and kept the env value, but a similar
``model: {default: null}`` shape would call ``.get("default", model)``
which returns ``None`` and clobbered ``model``.
"""
(tmp_path / "config.yaml").write_text("model:\n default: null\n")
monkeypatch.setenv("HERMES_MODEL", "env-model")
job = {"id": "null-default-job", "name": "null default", "prompt": "hi", "model": None}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
success, _, _, error = run_job(job)
assert success is True
assert mock_agent_cls.call_args.kwargs["model"] == "env-model"
def test_no_model_anywhere_fails_with_actionable_error(self, tmp_path, monkeypatch):
"""All three sources empty → fail fast with a clear message, not an opaque 400."""
(tmp_path / "config.yaml").write_text("")
monkeypatch.delenv("HERMES_MODEL", raising=False)
job = {"id": "no-model-job", "name": "no model anywhere", "prompt": "hi", "model": None}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
success, _, _, error = run_job(job)
assert success is False
assert error is not None
assert "no model configured" in error
# AIAgent must never be constructed with an empty model — that's
# precisely the bug we're guarding against.
mock_agent_cls.assert_not_called()
def test_job_model_update_takes_effect_on_next_run(self, tmp_path, monkeypatch):
"""The per-job model is re-read every tick — no in-memory cache.
This is the property the original bug report asked for. We verify
it by calling run_job twice with the same job dict mutated between
calls, simulating the storage update flow.
"""
(tmp_path / "config.yaml").write_text("")
monkeypatch.delenv("HERMES_MODEL", raising=False)
job = {"id": "updated-model-job", "name": "updated", "prompt": "hi", "model": "first-model"}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
run_job(job)
assert mock_agent_cls.call_args.kwargs["model"] == "first-model"
job["model"] = "second-model" # simulates jobs.json being rewritten
run_job(job)
assert mock_agent_cls.call_args.kwargs["model"] == "second-model"
def test_config_model_as_plain_string(self, tmp_path, monkeypatch):
"""config.yaml ``model:`` given as a bare string is used directly."""
(tmp_path / "config.yaml").write_text("model: string-form-model\n")
monkeypatch.delenv("HERMES_MODEL", raising=False)
job = {"id": "string-cfg-job", "name": "string cfg", "prompt": "hi", "model": None}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
success, _, _, error = run_job(job)
assert success is True
assert error is None
assert mock_agent_cls.call_args.kwargs["model"] == "string-form-model"
def test_config_model_alias_key_resolves(self, tmp_path, monkeypatch):
"""A ``model: {model: ...}`` alias key resolves like the CLI sibling.
``hermes_cli/oneshot.py``, ``fallback_cmd.py`` and ``prompt_size.py``
all accept ``model.model`` as an alias for ``model.default``. The cron
resolver mirrors that so a config that works in the CLI also works in
cron.
"""
(tmp_path / "config.yaml").write_text("model:\n model: alias-key-model\n")
monkeypatch.delenv("HERMES_MODEL", raising=False)
job = {"id": "alias-job", "name": "alias", "prompt": "hi", "model": None}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
success, _, _, error = run_job(job)
assert success is True
assert error is None
assert mock_agent_cls.call_args.kwargs["model"] == "alias-key-model"
def test_corrupt_config_yaml_does_not_crash_with_job_model(self, tmp_path, monkeypatch):
"""A malformed config.yaml degrades gracefully when the job has a model."""
(tmp_path / "config.yaml").write_text("{{{invalid yaml!!!")
monkeypatch.delenv("HERMES_MODEL", raising=False)
job = {"id": "corrupt-job", "name": "corrupt", "prompt": "hi", "model": "explicit-model"}
fake_db = MagicMock()
with patch("cron.scheduler._hermes_home", tmp_path), \
patch("cron.scheduler._resolve_origin", return_value=None), \
patch("dotenv.load_dotenv"), \
patch("hermes_state.SessionDB", return_value=fake_db), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
return_value=self._RUNTIME), \
patch("run_agent.AIAgent") as mock_agent_cls:
mock_agent = MagicMock()
mock_agent.run_conversation.return_value = {"final_response": "ok"}
mock_agent_cls.return_value = mock_agent
success, _, _, error = run_job(job)
# Explicit job model survives the corrupt-config fall-through.
assert success is True
assert error is None
assert mock_agent_cls.call_args.kwargs["model"] == "explicit-model"
class TestRunJobSkillBacked:
def test_run_job_preserves_skill_env_passthrough_into_worker_thread(self, tmp_path):
job = {