mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(cron): don't report a false 'gateway not running' on external-provider instances (#54600)
`hermes cron status` (and the create/list 'gateway not running' nag) judge whether cron will fire purely from the in-process ticker's heartbeat file + a live gateway PID. That heuristic is correct for the built-in ticker but WRONG for an external provider like Chronos: Chronos arms exactly one external one-shot per job and is fired by a NAS-mediated webhook (POST /api/cron/fire). Its `start()` returns immediately and it deliberately runs no 60s loop and writes no ticker heartbeat — that's the whole point of scale-to-zero (the machine is at zero between fires). So on a perfectly healthy Chronos instance, `cron status` always printed '✗ Gateway is not running — cron jobs will NOT fire' (or a STALLED-ticker warning), and `cron create` always appended the 'jobs won't fire automatically' nag — both false. Verified live on a staging Chronos instance: jobs fired and completed on schedule via the relay while `cron status` insisted the gateway wasn't running and the heartbeat was 370s+ stale. Fix: resolve the active provider (offline — `resolve_cron_scheduler`, whose `is_available()` contract forbids network) and, for any non-builtin provider, report the managed-scheduler state instead of the ticker heuristics, and suppress the ticker-only 'gateway not running' warning. The built-in path is byte-unchanged. Active-job summary is factored into a shared helper so both paths print it identically. New tests prove both directions (chronos: no false negative even with no gateway PID / no heartbeat; builtin: historical warning preserved) and fail without the fix.
This commit is contained in:
parent
e20ff352b9
commit
0943e2a272
2 changed files with 126 additions and 3 deletions
|
|
@ -57,6 +57,21 @@ def _cron_api(**kwargs):
|
|||
return json.loads(cronjob_tool(**kwargs))
|
||||
|
||||
|
||||
def _active_cron_provider_name() -> str:
|
||||
"""Name of the resolved cron scheduler provider ('builtin', 'chronos', …).
|
||||
|
||||
Best-effort + offline (``resolve_cron_scheduler`` reads config and the
|
||||
provider's ``is_available()`` contract forbids network). Returns 'builtin'
|
||||
on any failure so callers fall back to the historical ticker-based checks.
|
||||
"""
|
||||
try:
|
||||
from cron.scheduler_provider import resolve_cron_scheduler
|
||||
|
||||
return resolve_cron_scheduler().name or "builtin"
|
||||
except Exception:
|
||||
return "builtin"
|
||||
|
||||
|
||||
def _warn_if_gateway_not_running() -> None:
|
||||
"""Warn that scheduled jobs won't fire unless the gateway is running.
|
||||
|
||||
|
|
@ -65,8 +80,17 @@ def _warn_if_gateway_not_running() -> None:
|
|||
gateway, ``next_run_at`` passes but jobs never fire and ``last_run_at``
|
||||
stays null — the most common cron support report (#51038). Surfacing this
|
||||
at create/list time, when the user is right there, prevents it.
|
||||
|
||||
An external provider (e.g. Chronos) fires jobs via a NAS-mediated webhook,
|
||||
NOT the in-process ticker, so a momentarily-absent gateway process does not
|
||||
mean jobs won't fire — the warning would be a false alarm. Stay quiet for
|
||||
any non-builtin provider; the gateway-process heuristic only speaks to the
|
||||
built-in ticker's trigger.
|
||||
"""
|
||||
try:
|
||||
if _active_cron_provider_name() != "builtin":
|
||||
return
|
||||
|
||||
from hermes_cli.gateway import find_gateway_pids
|
||||
|
||||
if find_gateway_pids():
|
||||
|
|
@ -177,6 +201,30 @@ def cron_status():
|
|||
|
||||
print()
|
||||
|
||||
provider = _active_cron_provider_name()
|
||||
if provider != "builtin":
|
||||
# An external provider (e.g. Chronos) does NOT run the in-process 60s
|
||||
# ticker — it arms one external one-shot per job and is fired by a
|
||||
# NAS-mediated webhook, so between fires there is intentionally NO
|
||||
# ticker thread and NO heartbeat file. Reporting the ticker-heartbeat
|
||||
# staleness here would always say "stalled / not firing" on a perfectly
|
||||
# healthy Chronos instance. Report the provider instead and skip the
|
||||
# ticker-liveness heuristics entirely.
|
||||
print(color(
|
||||
f"✓ Cron provider: {provider} — jobs fire via the managed scheduler, "
|
||||
"not the in-process ticker.",
|
||||
Colors.GREEN,
|
||||
))
|
||||
print(color(
|
||||
" (No ticker heartbeat is expected for an external provider; "
|
||||
"due jobs are delivered by an authenticated webhook.)",
|
||||
Colors.DIM,
|
||||
))
|
||||
print()
|
||||
_print_active_jobs_summary(list_jobs(include_disabled=False))
|
||||
print()
|
||||
return
|
||||
|
||||
pids = find_gateway_pids()
|
||||
if pids:
|
||||
# The gateway PROCESS is alive — but the cron ticker THREAD inside it
|
||||
|
|
@ -231,7 +279,14 @@ def cron_status():
|
|||
|
||||
print()
|
||||
|
||||
jobs = list_jobs(include_disabled=False)
|
||||
_print_active_jobs_summary(list_jobs(include_disabled=False))
|
||||
|
||||
print()
|
||||
|
||||
|
||||
def _print_active_jobs_summary(jobs) -> None:
|
||||
"""Print the '<N> active job(s)' + next-run line shared by every status
|
||||
path (built-in ticker AND external provider)."""
|
||||
if jobs:
|
||||
next_runs = [j.get("next_run_at") for j in jobs if j.get("next_run_at")]
|
||||
print(f" {len(jobs)} active job(s)")
|
||||
|
|
@ -240,8 +295,6 @@ def cron_status():
|
|||
else:
|
||||
print(" No active jobs")
|
||||
|
||||
print()
|
||||
|
||||
|
||||
def cron_create(args):
|
||||
# Defense: reject cron jobs that contain gateway lifecycle commands.
|
||||
|
|
|
|||
|
|
@ -178,3 +178,73 @@ class TestGatewayNotRunningWarning:
|
|||
cron_command(Namespace(cron_command="list", all=True))
|
||||
out = capsys.readouterr().out
|
||||
assert "Gateway is not running" in out
|
||||
|
||||
|
||||
class TestExternalCronProviderStatus:
|
||||
"""With an external cron provider (e.g. Chronos), jobs fire via a
|
||||
NAS-mediated webhook, NOT the in-process ticker. The ticker-heartbeat /
|
||||
gateway-process heuristics are meaningless there, so neither
|
||||
`cron status` nor the create/list warning must claim the gateway being
|
||||
absent means jobs won't fire — that was a false-negative on every healthy
|
||||
Chronos instance (the heartbeat is intentionally never written).
|
||||
"""
|
||||
|
||||
def test_status_reports_provider_not_ticker_for_chronos(
|
||||
self, tmp_cron_dir, capsys, monkeypatch
|
||||
):
|
||||
create_job(prompt="Ping", schedule="every 2m")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.cron._active_cron_provider_name", lambda: "chronos"
|
||||
)
|
||||
# Even with NO gateway process and NO ticker heartbeat, Chronos status
|
||||
# must NOT report a stall / "not firing".
|
||||
monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: [])
|
||||
cron_command(Namespace(cron_command="status"))
|
||||
out = capsys.readouterr().out
|
||||
assert "chronos" in out
|
||||
assert "managed scheduler" in out
|
||||
assert "not firing" not in out.lower()
|
||||
assert "STALLED" not in out
|
||||
assert "Gateway is not running" not in out
|
||||
# Still surfaces the active-job summary.
|
||||
assert "active job(s)" in out
|
||||
|
||||
def test_status_unchanged_for_builtin(self, tmp_cron_dir, capsys, monkeypatch):
|
||||
create_job(prompt="Ping", schedule="every 2m")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.cron._active_cron_provider_name", lambda: "builtin"
|
||||
)
|
||||
monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: [])
|
||||
cron_command(Namespace(cron_command="status"))
|
||||
out = capsys.readouterr().out
|
||||
# Built-in path is the historical ticker-based report.
|
||||
assert "Gateway is not running" in out
|
||||
assert "managed scheduler" not in out
|
||||
|
||||
def test_create_silent_for_chronos_even_without_gateway(
|
||||
self, tmp_cron_dir, capsys, monkeypatch
|
||||
):
|
||||
# The create-time "gateway not running" nag is a ticker-only concern;
|
||||
# an external provider doesn't depend on a live in-process ticker.
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.cron._active_cron_provider_name", lambda: "chronos"
|
||||
)
|
||||
monkeypatch.setattr("hermes_cli.gateway.find_gateway_pids", lambda: [])
|
||||
cron_command(
|
||||
Namespace(
|
||||
cron_command="create",
|
||||
schedule="every 2m",
|
||||
prompt="Ping",
|
||||
name="Ping",
|
||||
deliver=None,
|
||||
repeat=None,
|
||||
skill=None,
|
||||
skills=None,
|
||||
script=None,
|
||||
workdir=None,
|
||||
no_agent=False,
|
||||
)
|
||||
)
|
||||
out = capsys.readouterr().out
|
||||
assert "Created job" in out
|
||||
assert "Gateway is not running" not in out
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue