hermes-agent/plugins/cron/chronos/__init__.py
Ben 4c8bbe6416 feat(cron): Chronos NAS-mediated managed-cron provider (scale-to-zero)
Phase 4D. The first non-default CronScheduler: plugins/cron/chronos/. Inert
unless cron.provider=chronos; resolve_cron_scheduler falls back to the built-in
if unavailable, so cron never loses its trigger.

Files:
- chronos/__init__.py — ChronosCronScheduler + register(ctx).
  * is_available(): config-only, NO network (portal_url + callback_url + a
    stored Nous access token via get_provider_auth_state). Returns False →
    resolver falls back to built-in.
  * start(): reconcile() then RETURN — no blocking loop, no 60s wake (DQ-1:
    this is what makes scale-to-zero real; the machine wakes only on a
    NAS→agent fire).
  * _arm_one_shot(job): POST NAS provision {job_id, fire_at, agent_callback_url,
    dedup_key=job_id:fire_at}. Agent owns the time → sub-minute fires survive
    (no scheduler 1-minute floor).
  * reconcile(): converge NAS arms toward jobs.json — arm missing/changed-time,
    cancel orphaned, skip paused. Cold process rebuilds from jobs.json +
    idempotent dedup_key.
  * on_jobs_changed(): reconcile (re-arm/cancel the affected one-shot).
  * fire_due(): ABC default (CAS claim + run_one_job) THEN re-arm the next
    one-shot. Job gone (one-shot done / repeat-N exhausted) → no re-arm.
- chronos/_nas_client.py — thin HTTP wrapper for provision/cancel/list using
  the agent's existing refresh-aware Nous token (resolve_nous_access_token).
  Names no scheduler vendor; holds no scheduler creds.
- chronos/plugin.yaml — discovery metadata.

INVARIANT: zero "qstash"/"upstash" hits in plugins/cron, gateway, hermes_cli,
website/docs — the external scheduler is a NAS-internal detail, never named
agent-side.

Tests (13, all NAS mocked, zero network): is_available off-without-config +
on-with-config + makes-no-network; arm payload incl. sub-minute + noop without
next_run; reconcile arms-all / cancels-orphan / skips-paused / skips-already-
armed; fire_due re-arms next / no re-arm when job gone / no re-arm when claim
lost.
2026-06-18 14:40:56 +10:00

241 lines
9.3 KiB
Python

"""Chronos — NAS-mediated managed cron provider (scale-to-zero).
Chronos (the Greek god of time, alongside Hermes) is the first non-default
``CronScheduler``. It lets a hosted gateway scale to zero while idle and still
fire cron jobs: instead of a 60s in-process ticker, it asks NAS to arm exactly
one external one-shot per job at that job's real next-fire time. NAS calls the
agent back at fire time over an authenticated webhook (``/api/cron/fire``); the
agent runs the job via the shared ``run_one_job`` body and re-arms the next
one-shot.
The external scheduler NAS uses is an internal NAS implementation detail —
Chronos names no vendor, holds no scheduler credentials, and speaks only to
NAS's ``agent-cron`` endpoints with the agent's existing Nous token.
Design constraints (see the plan's DQ-1):
- start() arms all enabled jobs and RETURNS; it never blocks and never spawns
a periodic wake. Between fires the machine is truly at zero.
- reconcile runs only on a warm process (start / on_jobs_changed / piggybacked
on a fire), never as a periodic wake of a sleeping machine.
Inert unless ``cron.provider: chronos``. ``resolve_cron_scheduler`` falls back
to the built-in if Chronos is unavailable, so cron never loses its trigger.
Wire contract: ``docs/chronos-managed-cron-contract.md``.
"""
from __future__ import annotations
import logging
import threading
from typing import Any, Dict, Optional
from cron.scheduler_provider import CronScheduler
logger = logging.getLogger("cron.chronos")
def _cfg(*keys: str, default: Any = "") -> Any:
"""Read a cron.chronos.* config value (no network)."""
try:
from hermes_cli.config import cfg_get, load_config
return cfg_get(load_config(), *keys, default=default)
except Exception:
return default
class ChronosCronScheduler(CronScheduler):
    """NAS-mediated external cron provider.

    Rather than ticking in-process, this scheduler asks NAS (through
    ``NasCronClient``) to arm one one-shot per job at the job's
    ``next_run_at`` and re-arms the next one after each fire.
    """

    def __init__(self) -> None:
        # In-memory map of job_id → fire_at we've asked NAS to arm. Best-effort
        # cache; reconcile rebuilds desired state from jobs.json, so a cold
        # process simply re-arms (idempotent via dedup_key).
        self._armed: Dict[str, str] = {}
        # Guards every read/write of self._armed.
        self._lock = threading.Lock()
        self._client = None  # lazily constructed (no network in is_available)

    # -- identity / availability -----------------------------------------

    @property
    def name(self) -> str:
        """Provider identifier: ``"chronos"``."""
        return "chronos"

    def is_available(self) -> bool:
        """Config presence only — NO network.

        Chronos needs a portal base URL, the agent's own publicly-reachable
        callback URL (for NAS→agent fires), and a usable Nous token (the agent
        is logged into the portal). If any is missing, resolve_cron_scheduler
        falls back to the built-in ticker.

        Returns:
            True only when both ``cron.chronos.portal_url`` and
            ``cron.chronos.callback_url`` are set and a stored Nous access
            token is present.
        """
        if not (_cfg("cron", "chronos", "portal_url") and _cfg("cron", "chronos", "callback_url")):
            return False
        return self._have_nous_token()

    def _have_nous_token(self) -> bool:
        """True if the stored auth state holds a Nous access token.

        Reads the stored state via ``get_provider_auth_state("nous")`` and
        checks for a truthy ``access_token``; it is meant to stay offline
        (is_available must not hit the network). The refresh-aware token is
        resolved lazily at provision time. Any failure counts as "no token".
        """
        try:
            from hermes_cli.auth import get_provider_auth_state

            state = get_provider_auth_state("nous") or {}
            return bool(state.get("access_token"))
        except Exception:
            return False

    # -- client -----------------------------------------------------------

    def _get_client(self):
        """Return the NAS client, constructing it on first use."""
        if self._client is None:
            from ._nas_client import NasCronClient

            self._client = NasCronClient(_cfg("cron", "chronos", "portal_url"))
        return self._client

    def _callback_url(self) -> str:
        """Return ``cron.chronos.callback_url`` as a string ("" when unset)."""
        return str(_cfg("cron", "chronos", "callback_url") or "")

    # -- lifecycle --------------------------------------------------------

    def start(self, stop_event, *, adapters=None, loop=None, interval=60):
        """Arm all enabled jobs via NAS, then RETURN immediately.

        Does NOT block and does NOT spawn a 60s wake (DQ-1) — that is the whole
        point of scale-to-zero. The machine wakes only on a NAS→agent fire.

        ``stop_event``, ``adapters``, ``loop`` and ``interval`` are accepted
        but not used by this method. A reconcile failure is logged at warning
        level and swallowed.
        """
        try:
            self.reconcile()
        except Exception as e:
            logger.warning("Chronos start() reconcile failed: %s", e)
        # Intentionally return — no loop, no periodic wake.

    def stop(self) -> None:
        """No-op: start() leaves nothing running that would need stopping."""
        return None

    def on_jobs_changed(self) -> None:
        """A job was created/updated/removed/paused/resumed — reconcile the NAS
        registry so the affected one-shot is (re-)armed or cancelled.

        A reconcile failure is logged at debug level and swallowed.
        """
        try:
            self.reconcile()
        except Exception as e:
            logger.debug("Chronos on_jobs_changed reconcile failed: %s", e)

    # -- arming -----------------------------------------------------------

    @staticmethod
    def _is_armable(job: Dict[str, Any]) -> bool:
        """True when ``job`` should have a one-shot armed.

        A job qualifies when it is enabled, has a ``next_run_at``, and its
        ``state`` is not ``"paused"``. Shared by reconcile() and fire_due() so
        both apply the same rule.
        """
        return bool(
            job.get("enabled")
            and job.get("next_run_at")
            and job.get("state") != "paused"
        )

    def _arm_one_shot(self, job: Dict[str, Any]) -> None:
        """Ask NAS to arm exactly one one-shot at the job's next_run_at.

        The agent computes the time; NAS+its scheduler are the dumb executor.
        The request carries ``dedup_key = "<job_id>:<fire_at>"`` so that NAS
        can treat a repeated arm of the same fire as a no-op.

        Does nothing when the job has no ``next_run_at``. The local cache is
        updated only after ``provision`` returns; errors from it propagate.
        """
        job_id = job["id"]
        fire_at = job.get("next_run_at")
        if not fire_at:
            return
        dedup_key = f"{job_id}:{fire_at}"
        self._get_client().provision(
            job_id=job_id,
            fire_at=fire_at,
            agent_callback_url=self._callback_url(),
            dedup_key=dedup_key,
        )
        with self._lock:
            self._armed[job_id] = fire_at

    def _cancel(self, job_id: str) -> None:
        """Ask NAS to cancel the one-shot for ``job_id``.

        The cache entry is dropped even when the NAS call raises (the
        exception still propagates to the caller).
        """
        try:
            self._get_client().cancel(job_id=job_id)
        finally:
            with self._lock:
                self._armed.pop(job_id, None)

    def _list_armed(self) -> Dict[str, str]:
        """Observed armed one-shots: job_id → fire_at.

        Prefer the in-memory map (warm process); on a cold/empty map, ask NAS
        (best-effort). If NAS list fails, return an empty mapping — reconcile
        then re-arms desired jobs idempotently.
        """
        with self._lock:
            if self._armed:
                # Hand back a copy so callers never iterate the live cache.
                return dict(self._armed)
        try:
            observed = {
                item["job_id"]: item.get("fire_at", "")
                for item in self._get_client().list_armed()
                if item.get("job_id")
            }
            with self._lock:
                self._armed.update(observed)
            return observed
        except Exception as e:
            logger.debug("Chronos _list_armed failed (will re-arm idempotently): %s", e)
            return {}

    # -- reconcile --------------------------------------------------------

    def reconcile(self) -> None:
        """Converge the NAS-armed one-shots toward jobs.json (desired state):
        arm missing / re-arm changed-time, cancel orphaned.

        Per-job arm/cancel failures are logged at warning level and do not
        stop the remaining jobs from being processed. Errors from loading the
        job list propagate to the caller.
        """
        from cron.jobs import get_job, load_jobs

        desired: Dict[str, str] = {
            j["id"]: j["next_run_at"] for j in load_jobs() if self._is_armable(j)
        }
        observed = self._list_armed()

        # Arm missing or changed-time.
        for job_id, fire_at in desired.items():
            if observed.get(job_id) == fire_at:
                continue  # already armed for this exact fire time
            # Re-fetch the full job dict to arm (need the whole record).
            job = get_job(job_id)
            if not job:
                continue
            try:
                self._arm_one_shot(job)
            except Exception as e:
                logger.warning("Chronos failed to arm job %s: %s", job_id, e)

        # Cancel orphans (armed but no longer desired). ``observed`` is a
        # snapshot, so _cancel mutating the cache does not disturb iteration.
        for job_id in observed:
            if job_id in desired:
                continue
            try:
                self._cancel(job_id)
            except Exception as e:
                logger.warning("Chronos failed to cancel orphan %s: %s", job_id, e)

    # -- fire -------------------------------------------------------------

    def fire_due(self, job_id: str, *, adapters: Any = None, loop: Any = None) -> bool:
        """Run the due job via the base-class ``fire_due``, then re-arm the
        NEXT one-shot through NAS.

        Re-arm happens AFTER the run so next_run_at reflects the completed fire.
        If the job is gone (get_job returns a falsy value) or is no longer
        armable (disabled, paused, or without a next_run_at — the same rule
        reconcile() applies), nothing is re-armed.

        Returns:
            The base-class result, unchanged. A falsy result skips the re-arm.
            A re-arm failure is logged at warning level and does not change
            the return value.
        """
        ran = super().fire_due(job_id, adapters=adapters, loop=loop)
        if not ran:
            return ran

        from cron.jobs import get_job

        job = get_job(job_id)
        if job and self._is_armable(job):
            try:
                self._arm_one_shot(job)
            except Exception as e:
                logger.warning("Chronos failed to re-arm job %s after fire: %s", job_id, e)
        return ran
def register(ctx) -> None:
    """Plugin entrypoint: hand a fresh Chronos scheduler to the loader.

    Follows the same shape as the memory plugins — plugins/cron discovery
    invokes this function and picks up the provider through
    ``ctx.register_cron_scheduler``.
    """
    scheduler = ChronosCronScheduler()
    ctx.register_cron_scheduler(scheduler)