mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
Phase 4D. The first non-default CronScheduler: plugins/cron/chronos/. Inert
unless cron.provider=chronos; resolve_cron_scheduler falls back to the built-in
if unavailable, so cron never loses its trigger.
Files:
- chronos/__init__.py — ChronosCronScheduler + register(ctx).
* is_available(): config-only, NO network (portal_url + callback_url + a
stored Nous access token via get_provider_auth_state). Returns False →
resolver falls back to built-in.
* start(): reconcile() then RETURN — no blocking loop, no 60s wake (DQ-1:
this is what makes scale-to-zero real; the machine wakes only on a
NAS→agent fire).
* _arm_one_shot(job): POST NAS provision {job_id, fire_at, agent_callback_url,
dedup_key=job_id:fire_at}. Agent owns the time → sub-minute fires survive
(no scheduler 1-minute floor).
* reconcile(): converge NAS arms toward jobs.json — arm missing/changed-time,
cancel orphaned, skip paused. Cold process rebuilds from jobs.json +
idempotent dedup_key.
* on_jobs_changed(): reconcile (re-arm/cancel the affected one-shot).
* fire_due(): ABC default (CAS claim + run_one_job) THEN re-arm the next
one-shot. Job gone (one-shot done / repeat-N exhausted) → no re-arm.
- chronos/_nas_client.py — thin HTTP wrapper for provision/cancel/list using
the agent's existing refresh-aware Nous token (resolve_nous_access_token).
Names no scheduler vendor; holds no scheduler creds.
- chronos/plugin.yaml — discovery metadata.
INVARIANT: zero "qstash"/"upstash" hits in plugins/cron, gateway, hermes_cli,
website/docs — the external scheduler is a NAS-internal detail, never named
agent-side.
Tests (13, all NAS mocked, zero network): is_available off-without-config +
on-with-config + makes-no-network; arm payload incl. sub-minute + noop without
next_run; reconcile arms-all / cancels-orphan / skips-paused / skips-already-
armed; fire_due re-arms next / no re-arm when job gone / no re-arm when claim
lost.
241 lines
9.3 KiB
Python
241 lines
9.3 KiB
Python
"""Chronos — NAS-mediated managed cron provider (scale-to-zero).
|
|
|
|
Chronos (the Greek god of time, alongside Hermes) is the first non-default
``CronScheduler``. It lets a hosted gateway scale to zero while idle and still
fire cron jobs: instead of a 60s in-process ticker, it asks NAS to arm exactly
one external one-shot per job at that job's real next-fire time. NAS calls the
agent back at fire time over an authenticated webhook (``/api/cron/fire``); the
agent runs the job via the shared ``run_one_job`` body and re-arms the next
one-shot.

The external scheduler NAS uses is an internal NAS implementation detail —
Chronos names no vendor, holds no scheduler credentials, and speaks only to
NAS's ``agent-cron`` endpoints with the agent's existing Nous token.

Design constraints (see the plan's DQ-1):
- start() arms all enabled jobs and RETURNS; it never blocks and never spawns
  a periodic wake. Between fires the machine is truly at zero.
- reconcile runs only on a warm process (start / on_jobs_changed / piggybacked
  on a fire), never as a periodic wake of a sleeping machine.

Inert unless ``cron.provider: chronos``. ``resolve_cron_scheduler`` falls back
to the built-in if Chronos is unavailable, so cron never loses its trigger.

Wire contract: ``docs/chronos-managed-cron-contract.md``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import threading
|
|
from typing import Any, Dict, Optional
|
|
|
|
from cron.scheduler_provider import CronScheduler
|
|
|
|
logger = logging.getLogger("cron.chronos")
|
|
|
|
|
|
def _cfg(*keys: str, default: Any = "") -> Any:
|
|
"""Read a cron.chronos.* config value (no network)."""
|
|
try:
|
|
from hermes_cli.config import cfg_get, load_config
|
|
return cfg_get(load_config(), *keys, default=default)
|
|
except Exception:
|
|
return default
|
|
|
|
|
|
class ChronosCronScheduler(CronScheduler):
    """NAS-mediated external cron provider.

    Instead of running an in-process ticker, this scheduler asks NAS (through
    ``NasCronClient``) to arm one one-shot per job at the job's
    ``next_run_at``. ``start()`` reconciles once and returns; subsequent work
    happens only in ``on_jobs_changed()`` and ``fire_due()``.
    """

    def __init__(self) -> None:
        # In-memory map of job_id → fire_at we've asked NAS to arm. Best-effort
        # cache; reconcile rebuilds desired state from jobs.json, so a cold
        # process simply re-arms (idempotent via dedup_key).
        self._armed: Dict[str, str] = {}
        # Guards every read/write of ``_armed``. Plain (non-reentrant) Lock:
        # never call NAS or another locking helper while holding it.
        self._lock = threading.Lock()
        self._client = None  # lazily constructed (no network in is_available)

    # -- identity / availability -----------------------------------------

    @property
    def name(self) -> str:
        """Provider identifier matched against ``cron.provider``."""
        return "chronos"

    def is_available(self) -> bool:
        """Config presence only — NO network.

        Chronos needs a portal base URL, the agent's own publicly-reachable
        callback URL (for NAS→agent fires), and a usable Nous token (the agent
        is logged into the portal). If any is missing, resolve_cron_scheduler
        falls back to the built-in ticker.

        Returns:
            True only when ``cron.chronos.portal_url`` and
            ``cron.chronos.callback_url`` are both set and a stored Nous
            access token exists.
        """
        if not (_cfg("cron", "chronos", "portal_url") and _cfg("cron", "chronos", "callback_url")):
            return False
        return self._have_nous_token()

    def _have_nous_token(self) -> bool:
        """True if the agent has a Nous Portal login (no network call).

        Checks the stored auth state for a Nous access token — does NOT refresh
        or hit the network (is_available must stay offline). The actual
        refresh-aware token is resolved lazily at provision time.

        Returns:
            False when no token is stored or when reading the auth state
            raises for any reason.
        """
        try:
            from hermes_cli.auth import get_provider_auth_state

            state = get_provider_auth_state("nous") or {}
            return bool(state.get("access_token"))
        except Exception:
            # Unreadable auth state is treated the same as "not logged in".
            return False

    # -- client -----------------------------------------------------------

    def _get_client(self):
        """Return the NAS client, constructing it on first use."""
        if self._client is None:
            from ._nas_client import NasCronClient

            self._client = NasCronClient(_cfg("cron", "chronos", "portal_url"))
        return self._client

    def _callback_url(self) -> str:
        """Return the configured agent callback URL, or ``""`` if unset."""
        return str(_cfg("cron", "chronos", "callback_url") or "")

    @staticmethod
    def _is_armable(job: Dict[str, Any]) -> bool:
        """True if *job* should currently have a one-shot armed.

        Single source of truth shared by ``reconcile()`` and ``fire_due()`` so
        both paths agree: the job must carry an id, be enabled, have a
        ``next_run_at``, and not be in the ``"paused"`` state.
        """
        return bool(
            job.get("id")
            and job.get("enabled")
            and job.get("next_run_at")
            and job.get("state") != "paused"
        )

    # -- lifecycle --------------------------------------------------------

    def start(self, stop_event, *, adapters=None, loop=None, interval=60):
        """Arm all enabled jobs via NAS, then RETURN immediately.

        Does NOT block and does NOT spawn a 60s wake (DQ-1) — that is the whole
        point of scale-to-zero. The machine wakes only on a NAS→agent fire.

        ``stop_event``, ``adapters``, ``loop`` and ``interval`` are accepted
        for signature compatibility and are not used here.
        """
        try:
            self.reconcile()
        except Exception as e:
            logger.warning("Chronos start() reconcile failed: %s", e)
        # Intentionally return — no loop, no periodic wake.

    def stop(self) -> None:
        """No-op: there is no background loop or thread to shut down."""
        return None

    def on_jobs_changed(self) -> None:
        """A job was created/updated/removed/paused/resumed — reconcile the NAS
        registry so the affected one-shot is (re-)armed or cancelled.

        Never raises; a failed reconcile is logged at WARNING (same level as
        ``start()``), because it means a changed job may be left un-armed.
        """
        try:
            self.reconcile()
        except Exception as e:
            logger.warning("Chronos on_jobs_changed reconcile failed: %s", e)

    # -- arming -----------------------------------------------------------

    def _arm_one_shot(self, job: Dict[str, Any]) -> None:
        """Ask NAS to arm exactly one one-shot at the job's next_run_at.

        The agent computes the time; NAS+its scheduler are the dumb executor.
        Idempotent per (job_id, fire_at) via dedup_key, so re-arming the same
        fire is a no-op NAS-side.

        Args:
            job: Job record; must contain ``"id"``. A record without a truthy
                ``"next_run_at"`` is silently skipped.

        Raises:
            KeyError: if ``job`` has no ``"id"``.
            Exception: whatever ``provision`` raises; the cache is only
                updated after a successful provision.
        """
        job_id = job["id"]
        fire_at = job.get("next_run_at")
        if not fire_at:
            return
        dedup_key = f"{job_id}:{fire_at}"
        self._get_client().provision(
            job_id=job_id,
            fire_at=fire_at,
            agent_callback_url=self._callback_url(),
            dedup_key=dedup_key,
        )
        with self._lock:
            self._armed[job_id] = fire_at

    def _cancel(self, job_id: str) -> None:
        """Cancel the one-shot for *job_id* via NAS and forget it locally.

        The cache entry is dropped even when the NAS call raises (the
        exception still propagates to the caller).
        """
        try:
            self._get_client().cancel(job_id=job_id)
        finally:
            with self._lock:
                self._armed.pop(job_id, None)

    def _list_armed(self) -> Dict[str, str]:
        """Observed armed one-shots: job_id → fire_at.

        Prefer the in-memory map (warm process); on a cold/empty map, ask NAS
        (best-effort). If NAS list fails, return what we have — reconcile then
        re-arms desired jobs idempotently.

        Returns:
            A snapshot dict that is safe to iterate while the cache mutates.
        """
        with self._lock:
            if self._armed:
                return dict(self._armed)
        # The lock is released before the NAS call: it is non-reentrant and is
        # re-acquired below to merge the result.
        try:
            observed = {
                item["job_id"]: item.get("fire_at", "")
                for item in self._get_client().list_armed()
                if item.get("job_id")
            }
            with self._lock:
                self._armed.update(observed)
            return observed
        except Exception as e:
            logger.debug("Chronos _list_armed failed (will re-arm idempotently): %s", e)
            return {}

    # -- reconcile --------------------------------------------------------

    def reconcile(self) -> None:
        """Converge the NAS-armed one-shots toward jobs.json (desired state):
        arm missing / re-arm changed-time, cancel orphaned.

        Per-job arm/cancel failures are logged and skipped so one bad job
        cannot block the rest.

        Raises:
            Exception: only if ``load_jobs()`` itself fails.
        """
        from cron.jobs import load_jobs

        # Keep the full records from the single load_jobs() snapshot: arming
        # needs only ``id`` and ``next_run_at``, which are already here, so no
        # per-job re-fetch is required. Records without an id are skipped by
        # _is_armable instead of aborting the whole pass.
        desired: Dict[str, Dict[str, Any]] = {
            job["id"]: job for job in load_jobs() if self._is_armable(job)
        }
        observed = self._list_armed()

        # Arm missing or changed-time.
        for job_id, job in desired.items():
            if observed.get(job_id) == job["next_run_at"]:
                continue
            try:
                self._arm_one_shot(job)
            except Exception as e:
                logger.warning("Chronos failed to arm job %s: %s", job_id, e)

        # Cancel orphans (armed but no longer desired). Iterate a copy of the
        # keys so the loop is independent of any mutation done by _cancel.
        for job_id in list(observed):
            if job_id in desired:
                continue
            try:
                self._cancel(job_id)
            except Exception as e:
                logger.warning("Chronos failed to cancel orphan %s: %s", job_id, e)

    # -- fire -------------------------------------------------------------

    def fire_due(self, job_id: str, *, adapters: Any = None, loop: Any = None) -> bool:
        """Run the due job (claim + run_one_job via the ABC default), then
        re-arm the NEXT one-shot through NAS.

        Re-arm happens AFTER the run so next_run_at reflects the completed fire.
        If the job is gone (one-shot completed / repeat-N exhausted), get_job
        returns None → nothing to re-arm (the schedule naturally stops). A job
        that is disabled or paused after the run is likewise not re-armed
        (same rule as ``reconcile()``).

        Returns:
            The result of the base-class ``fire_due`` unchanged; a failed
            re-arm is logged and does not alter it.
        """
        ran = super().fire_due(job_id, adapters=adapters, loop=loop)
        if not ran:
            return ran

        from cron.jobs import get_job

        job = get_job(job_id)
        if job and self._is_armable(job):
            try:
                self._arm_one_shot(job)
            except Exception as e:
                logger.warning("Chronos failed to re-arm job %s after fire: %s", job_id, e)
        return ran
|
|
|
|
|
|
def register(ctx) -> None:
    """Plugin entrypoint: hand a Chronos scheduler to the plugin loader.

    Follows the same shape as the memory plugins — plugins/cron discovery
    invokes this function and receives the provider through
    ``ctx.register_cron_scheduler``.

    Args:
        ctx: Loader context exposing ``register_cron_scheduler``.
    """
    scheduler = ChronosCronScheduler()
    ctx.register_cron_scheduler(scheduler)
|