fix(cron): repair migrated cron timezone offsets to prevent double-fire

A recurring cron job persists `next_run_at` as an absolute timestamp with a
UTC offset (e.g. `2026-05-19T21:00:00+10:00`). Cron expressions, however,
describe *local wall-clock* intent ("run at 21:00"). When Hermes/system
timezone changes after the timestamp was persisted, the stored instant is
re-interpreted in the new zone: `21:00+10:00` is the instant `13:00+02:00`,
which is `<= now` (13:02+02:00) — so the job fires HOURS EARLY, then
`compute_next_run` advances it via croniter to `21:00+02:00` the same day,
producing a SECOND fire. (#28934, recurrence of #24289.)

`_get_due_jobs_locked` now detects this precise migration case before the
due check: for a `cron` job whose converted instant looks due, whose stored
UTC offset differs from the current zone's, AND whose stored *wall-clock*
time is still in the future (distinguishing a migrated offset from a
genuinely missed run), it recomputes `next_run_at` from the schedule and
skips the early fire — preserving the local wall-clock intent.

Verified against the issue's reproducer: stored `21:00+10` under runtime
`+02:00` at wall-clock `13:02` is rescheduled to `21:00+02` instead of
firing early + again.

Salvaged from #28941 by @Tranquil-Flow (authorship preserved). Chosen over
the alternative approaches (#28951 normalize-to-UTC, #28985 rebase-and-match)
because UTC-normalization does not change the absolute-instant comparison and
so does not fix the early fire, and this guard is the tightest: it only acts
when all four conditions hold and reuses the existing `compute_next_run`.

Fixes #28934
This commit is contained in:
Tranquil-Flow 2026-06-21 13:22:56 +05:30
parent 9f4c0b27c9
commit f1f36b3bae
2 changed files with 213 additions and 3 deletions

View file

@ -409,6 +409,31 @@ def _ensure_aware(dt: datetime) -> datetime:
return dt.astimezone(target_tz)
def _timezone_offset_mismatch(stored: datetime, current: datetime) -> bool:
"""Return True when a stored aware timestamp uses a different UTC offset.
Naive stored timestamps return False: they carry no offset to compare, and
are normalized by ``_ensure_aware`` instead they intentionally never take
the offset-repair path.
"""
if stored.tzinfo is None or current.tzinfo is None:
return False
return stored.utcoffset() != current.utcoffset()
def _stored_wall_clock_is_future(stored: datetime, current: datetime) -> bool:
"""Return True when the stored local wall-clock time has not arrived yet.
Cron schedules express local wall-clock intent. If Hermes/system local time
changes after next_run_at was persisted, an old offset can make a future
wall-clock run look due at the converted absolute time (for example
21:00+10 becomes 13:00+02). Comparing naive wall-clock values lets us
distinguish that migration case from a genuinely missed run whose scheduled
wall time has already passed.
"""
return stored.replace(tzinfo=None) > current.replace(tzinfo=None)
def _recoverable_oneshot_run_at(
schedule: Dict[str, Any],
now: datetime,
@ -1276,10 +1301,50 @@ def _get_due_jobs_locked() -> List[Dict[str, Any]]:
needs_save = True
break
next_run_dt = _ensure_aware(datetime.fromisoformat(next_run))
raw_next_run_dt = datetime.fromisoformat(next_run)
schedule = job.get("schedule", {})
kind = schedule.get("kind")
next_run_dt = _ensure_aware(raw_next_run_dt)
# Migration repair: a cron job persists next_run_at as an absolute
# instant, but the cron expr describes local wall-clock intent. If the
# configured/system timezone changed after persistence, the stored
# instant's offset no longer matches now's, and its converted time can
# look due hours early (21:00+10 -> 13:00+02). When the stored *wall
# clock* is still in the future, recompute from the schedule so we fire
# at the intended local time instead of early-then-again.
#
# TRADE-OFF: this cannot distinguish a config/host TZ migration from a
# legitimate DST offset change. A DST boundary that satisfies all four
# conditions will recompute (and thus SKIP the pending occurrence, no
# catch-up) rather than fire it. Accepted: in the pure-migration case
# the recompute lands on the same wall-clock time later the same period,
# and DST-boundary collisions with a still-future stored wall clock are
# rare relative to the double-fire bug this prevents (#28934).
if (
kind == "cron"
and next_run_dt <= now
and _timezone_offset_mismatch(raw_next_run_dt, now)
and _stored_wall_clock_is_future(raw_next_run_dt, now)
):
new_next = compute_next_run(schedule, now.isoformat())
if new_next:
logger.info(
"Job '%s' next_run_at offset changed (%s -> %s). "
"Recomputing cron run to preserve local wall-clock intent: %s",
job.get("name", job["id"]),
raw_next_run_dt.utcoffset(),
now.utcoffset(),
new_next,
)
for rj in raw_jobs:
if rj["id"] == job["id"]:
rj["next_run_at"] = new_next
needs_save = True
break
continue
if next_run_dt <= now:
schedule = job.get("schedule", {})
kind = schedule.get("kind")
# For recurring jobs, check if the scheduled time is stale
# (gateway was down and missed the window). Fast-forward to

View file

@ -849,6 +849,151 @@ class TestGetDueJobs:
assert recovered_dt > now
def test_cron_next_run_offset_migration_is_rescheduled_not_fired(self, tmp_cron_dir, monkeypatch):
current_tz = timezone(timedelta(hours=2))
now = datetime(2026, 5, 19, 13, 2, 0, tzinfo=current_tz)
monkeypatch.setattr("cron.jobs._hermes_now", lambda: now)
# A 21:00 cron was stored while Hermes/system local time was UTC+10.
# After the host moves to UTC+02, that absolute timestamp converts to
# 13:00+02. At 13:02+02 the old code considered it due and fired, even
# though the user's local wall-clock cron intent is still 21:00.
save_jobs(
[{
"id": "cron-tz-migrate",
"name": "Migrated local cron",
"prompt": "...",
"schedule": {"kind": "cron", "expr": "0 21 * * 2", "display": "0 21 * * 2"},
"schedule_display": "0 21 * * 2",
"repeat": {"times": None, "completed": 0},
"enabled": True,
"state": "scheduled",
"paused_at": None,
"paused_reason": None,
"created_at": "2026-05-12T21:00:00+10:00",
"next_run_at": "2026-05-19T21:00:00+10:00",
"last_run_at": "2026-05-12T21:00:00+10:00",
"last_status": "ok",
"last_error": None,
"deliver": "local",
"origin": None,
}]
)
assert get_due_jobs() == []
repaired = datetime.fromisoformat(get_job("cron-tz-migrate")["next_run_at"])
assert repaired == datetime(2026, 5, 19, 21, 0, 0, tzinfo=current_tz)
def test_cron_offset_migration_does_not_repair_already_passed_wall_time(self, tmp_cron_dir, monkeypatch):
current_tz = timezone(timedelta(hours=2))
now = datetime(2026, 5, 19, 13, 2, 0, tzinfo=current_tz)
monkeypatch.setattr("cron.jobs._hermes_now", lambda: now)
save_jobs(
[{
"id": "cron-tz-missed",
"name": "Migrated missed cron",
"prompt": "...",
"schedule": {"kind": "cron", "expr": "0 9 * * 2", "display": "0 9 * * 2"},
"schedule_display": "0 9 * * 2",
"repeat": {"times": None, "completed": 0},
"enabled": True,
"state": "scheduled",
"paused_at": None,
"paused_reason": None,
"created_at": "2026-05-12T09:00:00+10:00",
"next_run_at": "2026-05-19T09:00:00+10:00",
"last_run_at": "2026-05-12T09:00:00+10:00",
"last_status": "ok",
"last_error": None,
"deliver": "local",
"origin": None,
}]
)
# The wall-clock time has already passed, so this follows the existing
# stale-run fast-forward behavior instead of the timezone-migration
# repair path for future wall-clock runs.
assert get_due_jobs() == []
repaired = datetime.fromisoformat(get_job("cron-tz-missed")["next_run_at"])
assert repaired == datetime(2026, 5, 26, 9, 0, 0, tzinfo=current_tz)
def test_same_tz_due_cron_still_fires(self, tmp_cron_dir, monkeypatch):
"""Guard must NOT over-fire: a due cron in the SAME offset fires normally."""
current_tz = timezone(timedelta(hours=2))
now = datetime(2026, 5, 19, 21, 0, 30, tzinfo=current_tz)
monkeypatch.setattr("cron.jobs._hermes_now", lambda: now)
save_jobs([{
"id": "cron-same-tz", "name": "same tz", "prompt": "...",
"schedule": {"kind": "cron", "expr": "0 21 * * 2", "display": "0 21 * * 2"},
"schedule_display": "0 21 * * 2",
"repeat": {"times": None, "completed": 0},
"enabled": True, "state": "scheduled", "paused_at": None, "paused_reason": None,
"created_at": "2026-05-12T21:00:00+02:00",
"next_run_at": "2026-05-19T21:00:00+02:00", # same offset as now
"last_run_at": "2026-05-12T21:00:00+02:00",
"last_status": "ok", "last_error": None, "deliver": "local", "origin": None,
}])
# offset matches -> guard skips -> the genuinely-due job is returned to fire.
due = get_due_jobs()
assert [j["id"] for j in due] == ["cron-same-tz"]
def test_interval_job_with_stale_offset_is_unaffected(self, tmp_cron_dir, monkeypatch):
"""The offset-repair guard is cron-only; interval jobs never take it.
A stale-offset interval job whose converted instant is well past the
grace window is handled by the pre-existing stale fast-forward path
(not the cron repair path). Verify it fast-forwards via interval math
(next = now + interval), proving the cron-only guard didn't touch it.
"""
current_tz = timezone(timedelta(hours=2))
now = datetime(2026, 5, 19, 13, 2, 0, tzinfo=current_tz)
monkeypatch.setattr("cron.jobs._hermes_now", lambda: now)
save_jobs([{
"id": "interval-stale-tz", "name": "interval", "prompt": "...",
"schedule": {"kind": "interval", "minutes": 60, "display": "every 1h"},
"schedule_display": "every 1h",
"repeat": {"times": None, "completed": 0},
"enabled": True, "state": "scheduled", "paused_at": None, "paused_reason": None,
"created_at": "2026-05-19T10:00:00+10:00",
"next_run_at": "2026-05-19T12:00:00+10:00", # stale offset, instant 04:00+02 (well past)
"last_run_at": "2026-05-19T11:00:00+10:00",
"last_status": "ok", "last_error": None, "deliver": "local", "origin": None,
}])
get_due_jobs()
# The cron-only repair path would have produced a cron occurrence; instead
# the interval stale fast-forward recomputes next = now + 60m (interval
# math), confirming the guard did not intercept this interval job.
nr = datetime.fromisoformat(get_job("interval-stale-tz")["next_run_at"])
assert nr == now + timedelta(minutes=60)
def test_offset_migration_at_wall_clock_equal_now_falls_through(self, tmp_cron_dir, monkeypatch):
"""Boundary: stored wall-clock == now wall-clock (strict >) does NOT take
the repair path it falls through to the existing due/fast-forward logic."""
current_tz = timezone(timedelta(hours=2))
now = datetime(2026, 5, 19, 13, 0, 0, tzinfo=current_tz)
monkeypatch.setattr("cron.jobs._hermes_now", lambda: now)
save_jobs([{
"id": "cron-wall-equal", "name": "wall equal", "prompt": "...",
"schedule": {"kind": "cron", "expr": "0 13 * * 2", "display": "0 13 * * 2"},
"schedule_display": "0 13 * * 2",
"repeat": {"times": None, "completed": 0},
"enabled": True, "state": "scheduled", "paused_at": None, "paused_reason": None,
"created_at": "2026-05-12T13:00:00+10:00",
# stored naive wall-clock 13:00 == now naive wall-clock 13:00 -> strict > is False
"next_run_at": "2026-05-19T13:00:00+10:00",
"last_run_at": "2026-05-12T13:00:00+10:00",
"last_status": "ok", "last_error": None, "deliver": "local", "origin": None,
}])
# _stored_wall_clock_is_future is strict (>), so 13:00 == 13:00 is False
# -> repair guard skipped -> existing logic handles it (does not raise).
get_due_jobs() # must not raise / must not take the repair branch
# next_run_at must NOT have been rewritten to a future cron occurrence by
# the repair path (it either fires or fast-forwards via the normal path).
nr = get_job("cron-wall-equal")["next_run_at"]
assert nr is None or datetime.fromisoformat(nr).utcoffset() == now.utcoffset() or "+10:00" in nr
class TestEnabledToolsets:
def test_enabled_toolsets_stored(self, tmp_cron_dir):
job = create_job(prompt="monitor", schedule="every 1h", enabled_toolsets=["web", "terminal"])