From a5e06078b2ecb6201ed5332c88ab9df553d04c97 Mon Sep 17 00:00:00 2001 From: Sahil Saghir <218421507+Sahil-SS9@users.noreply.github.com> Date: Thu, 18 Jun 2026 15:46:55 +0100 Subject: [PATCH] fix(cron): compact cron failure messages + repair bare repo dirs after git gc Two small, focused fixes for the cron scheduler and checkpoint manager. 1. _summarize_cron_failure_for_delivery (cron/scheduler.py): Replaces the raw error dump in _process_job with a compact pattern-matched summary. Provider rate limits, timeouts, and authentication errors now produce a short human-readable message instead of dumping multi-KB provider JSON into the delivery channel. 2. _repair_bare_repo_dirs (tools/checkpoint_manager.py): Recreates refs/heads/ and branches/ directories after git gc --prune=now, which can remove empty dirs from bare repos and cause subsequent git add -A to fail with 'fatal: not a git repository'. Called after all four git gc call sites. Both fixes use only standard library imports and plug into existing call sites with no architectural changes. --- cron/scheduler.py | 56 ++++++++++++++++++++++++++++++++++++- tools/checkpoint_manager.py | 26 +++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/cron/scheduler.py b/cron/scheduler.py index 35906996619..d010763b33d 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -15,6 +15,7 @@ import contextvars import json import logging import os +import re import shutil import subprocess import sys @@ -45,6 +46,59 @@ from hermes_time import now as _hermes_now logger = logging.getLogger(__name__) +def _summarize_cron_failure_for_delivery(job: dict, error: str | None) -> str: + """Return a compact one-line failure message for chat delivery. + + Full details stay in the cron output directory and the logs. Chat should + show the operator what broke without dumping provider JSON, retry noise, or + stack traces into the delivery channel. 
+ """ + job_name = job.get("name") or job.get("id") or "cron job" + text = (error or "unknown error").strip() + lower = text.lower() + + # Provider/API failures are the common noisy path. Keep these short. + if "429" in text or "rate limit" in lower or "usage limit" in lower: + reason = "rate limit" + if "weekly usage limit" in lower: + reason = "weekly usage limit" + elif "quota" in lower: + reason = "quota limit" + return ( + f"⚠️ Cron '{job_name}' failed: provider {reason}. " + "Fallback chain was exhausted or unavailable. " + "Full details saved in cron output." + ) + + if "readtimeout" in lower or "timed out" in lower or "timeout" in lower: + return ( + f"⚠️ Cron '{job_name}' failed: provider timeout. " + "Fallback chain was exhausted or unavailable. " + "Full details saved in cron output." + ) + + # Match authentication/authorization wording at a word boundary and the + # 401/403 status codes as whole tokens, so "oauth", "4015" and similar do + # not trip a misleading auth message. + if re.search(r"authenticat|authoriz", lower) or re.search(r"\b(401|403)\b", text): + return ( + f"⚠️ Cron '{job_name}' failed: provider authentication error. " + "Full details saved in cron output." + ) + + # Strip common exception wrappers and collapse provider payloads. Bound + # the input first so a multi-KB provider blob cannot slow the + # substitutions. + cleaned = re.sub( + r"^(RuntimeError|Exception|ValueError|HTTPStatusError):\s*", + "", text[:2000], + ) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + if len(cleaned) > 180: + cleaned = cleaned[:177].rstrip() + "..." + return f"⚠️ Cron '{job_name}' failed: {cleaned}" + + class CronPromptInjectionBlocked(Exception): """Raised by _build_job_prompt when the fully-assembled prompt trips the injection scanner. Caught in run_job so the operator sees a clean @@ -2056,7 +2110,7 @@ def tick(verbose: bool = True, adapters=None, loop=None, sync: bool = True) -> i # Deliver the final response to the origin/target chat. 
def _repair_bare_repo_dirs(store: Path) -> None:
    """Ensure ``refs/heads`` and ``branches`` exist under *store*.

    Intended to run right after ``git gc --prune=now`` on the checkpoint
    store. Per the patch author's note, gc on a bare repo whose refs are all
    packed can leave these directories missing, after which ``git add -A``
    reports ``fatal: not a git repository``.
    NOTE(review): the exact git versions affected are not verifiable from
    this file; confirm against the git release notes.

    The repair is best-effort. Any path that already exists is left alone,
    a successful creation is logged at debug level, and an ``OSError`` from
    ``mkdir`` is logged as a warning instead of being raised.

    Args:
        store: Root directory of the bare checkpoint repository.
    """
    for name in ("refs/heads", "branches"):
        target = store / name
        if target.exists():
            # Already present; nothing to repair for this entry.
            continue
        try:
            target.mkdir(parents=True, exist_ok=True)
            logger.debug("Repaired missing %s in checkpoint store", name)
        except OSError as err:
            logger.warning(
                "Cannot create %s in checkpoint store: %s", name, err,
            )