fix(cron): don't report a false 'gateway not running' on external-provider instances (#54600)

`hermes cron status` (and the create/list 'gateway not running' nag)
judge whether cron will fire purely from the in-process ticker's
heartbeat file + a live gateway PID. That heuristic is correct for the
built-in ticker but WRONG for an external provider like Chronos:

Chronos arms exactly one external one-shot per job and is fired by a
NAS-mediated webhook (POST /api/cron/fire). Its `start()` returns
immediately and it deliberately runs no 60s loop and writes no ticker
heartbeat — that's the whole point of scale-to-zero (the machine is at
zero between fires). So on a perfectly healthy Chronos instance,
`cron status` always printed '✗ Gateway is not running — cron jobs will
NOT fire' (or a STALLED-ticker warning), and `cron create` always
appended the 'jobs won't fire automatically' nag — both false.

Verified live on a staging Chronos instance: jobs fired and completed on
schedule via the relay while `cron status` insisted the gateway wasn't
running and the heartbeat was 370s+ stale.

Fix: resolve the active provider (offline — `resolve_cron_scheduler`,
whose `is_available()` contract forbids network) and, for any non-builtin
provider, report the managed-scheduler state instead of the ticker
heuristics, and suppress the ticker-only 'gateway not running' warning.
The built-in path is byte-unchanged. Active-job summary is factored into
a shared helper so both paths print it identically.

New tests prove both directions (chronos: no false negative even with no
gateway PID / no heartbeat; builtin: historical warning preserved) and
fail without the fix.
This commit is contained in:
Ben Barclay 2026-06-29 14:03:02 +10:00 committed by GitHub
parent e20ff352b9
commit 0943e2a272
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 126 additions and 3 deletions

View file

@ -57,6 +57,21 @@ def _cron_api(**kwargs):
return json.loads(cronjob_tool(**kwargs))
def _active_cron_provider_name() -> str:
"""Name of the resolved cron scheduler provider ('builtin', 'chronos', …).
Best-effort + offline (``resolve_cron_scheduler`` reads config and the
provider's ``is_available()`` contract forbids network). Returns 'builtin'
on any failure so callers fall back to the historical ticker-based checks.
"""
try:
from cron.scheduler_provider import resolve_cron_scheduler
return resolve_cron_scheduler().name or "builtin"
except Exception:
return "builtin"
def _warn_if_gateway_not_running() -> None:
"""Warn that scheduled jobs won't fire unless the gateway is running.
@ -65,8 +80,17 @@ def _warn_if_gateway_not_running() -> None:
gateway, ``next_run_at`` passes but jobs never fire and ``last_run_at``
stays null the most common cron support report (#51038). Surfacing this
at create/list time, when the user is right there, prevents it.
An external provider (e.g. Chronos) fires jobs via a NAS-mediated webhook,
NOT the in-process ticker, so a momentarily-absent gateway process does not
mean jobs won't fire — the warning would be a false alarm. Stay quiet for
any non-builtin provider; the gateway-process heuristic only speaks to the
built-in ticker's trigger.
"""
try:
if _active_cron_provider_name() != "builtin":
return
from hermes_cli.gateway import find_gateway_pids
if find_gateway_pids():
@ -177,6 +201,30 @@ def cron_status():
print()
provider = _active_cron_provider_name()
if provider != "builtin":
# An external provider (e.g. Chronos) does NOT run the in-process 60s
# ticker — it arms one external one-shot per job and is fired by a
# NAS-mediated webhook, so between fires there is intentionally NO
# ticker thread and NO heartbeat file. Reporting the ticker-heartbeat
# staleness here would always say "stalled / not firing" on a perfectly
# healthy Chronos instance. Report the provider instead and skip the
# ticker-liveness heuristics entirely.
print(color(
f"✓ Cron provider: {provider} — jobs fire via the managed scheduler, "
"not the in-process ticker.",
Colors.GREEN,
))
print(color(
" (No ticker heartbeat is expected for an external provider; "
"due jobs are delivered by an authenticated webhook.)",
Colors.DIM,
))
print()
_print_active_jobs_summary(list_jobs(include_disabled=False))
print()
return
pids = find_gateway_pids()
if pids:
# The gateway PROCESS is alive — but the cron ticker THREAD inside it
@ -231,7 +279,14 @@ def cron_status():
print()
jobs = list_jobs(include_disabled=False)
_print_active_jobs_summary(list_jobs(include_disabled=False))
print()
def _print_active_jobs_summary(jobs) -> None:
"""Print the '<N> active job(s)' + next-run line shared by every status
path (built-in ticker AND external provider)."""
if jobs:
next_runs = [j.get("next_run_at") for j in jobs if j.get("next_run_at")]
print(f" {len(jobs)} active job(s)")
@ -240,8 +295,6 @@ def cron_status():
else:
print(" No active jobs")
print()
def cron_create(args):
# Defense: reject cron jobs that contain gateway lifecycle commands.

View file

@ -178,3 +178,73 @@ class TestGatewayNotRunningWarning:
cron_command(Namespace(cron_command="list", all=True))
out = capsys.readouterr().out
assert "Gateway is not running" in out
class TestExternalCronProviderStatus:
"""With an external cron provider (e.g. Chronos), jobs fire via a
NAS-mediated webhook, NOT the in-process ticker. The ticker-heartbeat /
gateway-process heuristics are meaningless there, so neither
`cron status` nor the create/list warning must claim the gateway being
absent means jobs won't fire — that was a false-negative on every healthy
Chronos instance (the heartbeat is intentionally never written).
"""
def test_status_reports_provider_not_ticker_for_chronos(
self, tmp_cron_dir, capsys, monkeypatch
):
create_job(prompt="Ping", schedule="every 2m")
monkeypatch.setattr(
"hermes_cli.cron._active_cron_provider_name", lambda: "chronos"
)
# Even with NO gateway process and NO ticker heartbeat, Chronos status
# must NOT report a stall / "not firing".
monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: [])
cron_command(Namespace(cron_command="status"))
out = capsys.readouterr().out
assert "chronos" in out
assert "managed scheduler" in out
assert "not firing" not in out.lower()
assert "STALLED" not in out
assert "Gateway is not running" not in out
# Still surfaces the active-job summary.
assert "active job(s)" in out
def test_status_unchanged_for_builtin(self, tmp_cron_dir, capsys, monkeypatch):
create_job(prompt="Ping", schedule="every 2m")
monkeypatch.setattr(
"hermes_cli.cron._active_cron_provider_name", lambda: "builtin"
)
monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: [])
cron_command(Namespace(cron_command="status"))
out = capsys.readouterr().out
# Built-in path is the historical ticker-based report.
assert "Gateway is not running" in out
assert "managed scheduler" not in out
def test_create_silent_for_chronos_even_without_gateway(
self, tmp_cron_dir, capsys, monkeypatch
):
# The create-time "gateway not running" nag is a ticker-only concern;
# an external provider doesn't depend on a live in-process ticker.
monkeypatch.setattr(
"hermes_cli.cron._active_cron_provider_name", lambda: "chronos"
)
monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: [])
cron_command(
Namespace(
cron_command="create",
schedule="every 2m",
prompt="Ping",
name="Ping",
deliver=None,
repeat=None,
skill=None,
skills=None,
script=None,
workdir=None,
no_agent=False,
)
)
out = capsys.readouterr().out
assert "Created job" in out
assert "Gateway is not running" not in out