From 0943e2a2720fd2b7eb5aee19c3bad0d495e5450a Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Mon, 29 Jun 2026 14:03:02 +1000 Subject: [PATCH] fix(cron): don't report a false 'gateway not running' on external-provider instances (#54600) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `hermes cron status` (and the create/list 'gateway not running' nag) judge whether cron will fire purely from the in-process ticker's heartbeat file + a live gateway PID. That heuristic is correct for the built-in ticker but WRONG for an external provider like Chronos: Chronos arms exactly one external one-shot per job and is fired by a NAS-mediated webhook (POST /api/cron/fire). Its `start()` returns immediately and it deliberately runs no 60s loop and writes no ticker heartbeat — that's the whole point of scale-to-zero (the machine is at zero between fires). So on a perfectly healthy Chronos instance, `cron status` always printed '✗ Gateway is not running — cron jobs will NOT fire' (or a STALLED-ticker warning), and `cron create` always appended the 'jobs won't fire automatically' nag — both false. Verified live on a staging Chronos instance: jobs fired and completed on schedule via the relay while `cron status` insisted the gateway wasn't running and the heartbeat was 370s+ stale. Fix: resolve the active provider (offline — `resolve_cron_scheduler`, whose `is_available()` contract forbids network) and, for any non-builtin provider, report the managed-scheduler state instead of the ticker heuristics, and suppress the ticker-only 'gateway not running' warning. The built-in path is byte-unchanged. Active-job summary is factored into a shared helper so both paths print it identically. New tests prove both directions (chronos: no false negative even with no gateway PID / no heartbeat; builtin: historical warning preserved) and fail without the fix. --- hermes_cli/cron.py | 59 +++++++++++++++++++++++++++-- tests/hermes_cli/test_cron.py | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 3 deletions(-) diff --git a/hermes_cli/cron.py b/hermes_cli/cron.py index 0f3b5a272b3..1f806050ad9 100644 --- a/hermes_cli/cron.py +++ b/hermes_cli/cron.py @@ -57,6 +57,21 @@ def _cron_api(**kwargs): return json.loads(cronjob_tool(**kwargs)) +def _active_cron_provider_name() -> str: + """Name of the resolved cron scheduler provider ('builtin', 'chronos', …). + + Best-effort + offline (``resolve_cron_scheduler`` reads config and the + provider's ``is_available()`` contract forbids network). Returns 'builtin' + on any failure so callers fall back to the historical ticker-based checks. + """ + try: + from cron.scheduler_provider import resolve_cron_scheduler + + return resolve_cron_scheduler().name or "builtin" + except Exception: + return "builtin" + + def _warn_if_gateway_not_running() -> None: """Warn that scheduled jobs won't fire unless the gateway is running. @@ -65,8 +80,17 @@ def _warn_if_gateway_not_running() -> None: gateway, ``next_run_at`` passes but jobs never fire and ``last_run_at`` stays null — the most common cron support report (#51038). Surfacing this at create/list time, when the user is right there, prevents it. + + An external provider (e.g. Chronos) fires jobs via a NAS-mediated webhook, + NOT the in-process ticker, so a momentarily-absent gateway process does not + mean jobs won't fire — the warning would be a false alarm. Stay quiet for + any non-builtin provider; the gateway-process heuristic only speaks to the + built-in ticker's trigger. """ try: + if _active_cron_provider_name() != "builtin": + return + from hermes_cli.gateway import find_gateway_pids if find_gateway_pids(): @@ -177,6 +201,30 @@ def cron_status(): print() + provider = _active_cron_provider_name() + if provider != "builtin": + # An external provider (e.g. Chronos) does NOT run the in-process 60s + # ticker — it arms one external one-shot per job and is fired by a + # NAS-mediated webhook, so between fires there is intentionally NO + # ticker thread and NO heartbeat file. Reporting the ticker-heartbeat + # staleness here would always say "stalled / not firing" on a perfectly + # healthy Chronos instance. Report the provider instead and skip the + # ticker-liveness heuristics entirely. + print(color( + f"✓ Cron provider: {provider} — jobs fire via the managed scheduler, " + "not the in-process ticker.", + Colors.GREEN, + )) + print(color( + " (No ticker heartbeat is expected for an external provider; " + "due jobs are delivered by an authenticated webhook.)", + Colors.DIM, + )) + print() + _print_active_jobs_summary(list_jobs(include_disabled=False)) + print() + return + pids = find_gateway_pids() if pids: # The gateway PROCESS is alive — but the cron ticker THREAD inside it @@ -231,7 +279,14 @@ def cron_status(): print() - jobs = list_jobs(include_disabled=False) + _print_active_jobs_summary(list_jobs(include_disabled=False)) + + print() + + +def _print_active_jobs_summary(jobs) -> None: + """Print the ' active job(s)' + next-run line shared by every status + path (built-in ticker AND external provider).""" if jobs: next_runs = [j.get("next_run_at") for j in jobs if j.get("next_run_at")] print(f" {len(jobs)} active job(s)") @@ -240,8 +295,6 @@ def cron_status(): else: print(" No active jobs") - print() - def cron_create(args): # Defense: reject cron jobs that contain gateway lifecycle commands. diff --git a/tests/hermes_cli/test_cron.py b/tests/hermes_cli/test_cron.py index 1281589048b..8cd7ef39659 100644 --- a/tests/hermes_cli/test_cron.py +++ b/tests/hermes_cli/test_cron.py @@ -178,3 +178,73 @@ class TestGatewayNotRunningWarning: cron_command(Namespace(cron_command="list", all=True)) out = capsys.readouterr().out assert "Gateway is not running" in out + + +class TestExternalCronProviderStatus: + """With an external cron provider (e.g. Chronos), jobs fire via a + NAS-mediated webhook, NOT the in-process ticker. The ticker-heartbeat / + gateway-process heuristics are meaningless there, so neither + `cron status` nor the create/list warning must claim the gateway being + absent means jobs won't fire — that was a false-negative on every healthy + Chronos instance (the heartbeat is intentionally never written). + """ + + def test_status_reports_provider_not_ticker_for_chronos( + self, tmp_cron_dir, capsys, monkeypatch + ): + create_job(prompt="Ping", schedule="every 2m") + monkeypatch.setattr( + "hermes_cli.cron._active_cron_provider_name", lambda: "chronos" + ) + # Even with NO gateway process and NO ticker heartbeat, Chronos status + # must NOT report a stall / "not firing". + monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: []) + cron_command(Namespace(cron_command="status")) + out = capsys.readouterr().out + assert "chronos" in out + assert "managed scheduler" in out + assert "not firing" not in out.lower() + assert "STALLED" not in out + assert "Gateway is not running" not in out + # Still surfaces the active-job summary. + assert "active job(s)" in out + + def test_status_unchanged_for_builtin(self, tmp_cron_dir, capsys, monkeypatch): + create_job(prompt="Ping", schedule="every 2m") + monkeypatch.setattr( + "hermes_cli.cron._active_cron_provider_name", lambda: "builtin" + ) + monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: []) + cron_command(Namespace(cron_command="status")) + out = capsys.readouterr().out + # Built-in path is the historical ticker-based report. + assert "Gateway is not running" in out + assert "managed scheduler" not in out + + def test_create_silent_for_chronos_even_without_gateway( + self, tmp_cron_dir, capsys, monkeypatch + ): + # The create-time "gateway not running" nag is a ticker-only concern; + # an external provider doesn't depend on a live in-process ticker. + monkeypatch.setattr( + "hermes_cli.cron._active_cron_provider_name", lambda: "chronos" + ) + monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: []) + cron_command( + Namespace( + cron_command="create", + schedule="every 2m", + prompt="Ping", + name="Ping", + deliver=None, + repeat=None, + skill=None, + skills=None, + script=None, + workdir=None, + no_agent=False, + ) + ) + out = capsys.readouterr().out + assert "Created job" in out + assert "Gateway is not running" not in out