fix(cron): compact cron failure messages + repair bare repo dirs after git gc

Two small, focused fixes for the cron scheduler and checkpoint manager.

1. _summarize_cron_failure_for_delivery (cron/scheduler.py):
   Replaces the raw error dump in _process_job with a compact
   pattern-matched summary. Provider rate limits, timeouts, and
   authentication errors now produce a short human-readable message
   instead of dumping multi-KB provider JSON into the delivery channel.

2. _repair_bare_repo_dirs (tools/checkpoint_manager.py):
   Recreates refs/heads/ and branches/ directories after git gc
   --prune=now, which can remove empty dirs from bare repos and cause
   subsequent git add -A to fail with 'fatal: not a git repository'.
   Called after all four git gc call sites.

Both fixes use only standard library imports and plug into existing
call sites with no architectural changes.
This commit is contained in:
Sahil Saghir 2026-06-18 15:46:55 +01:00 committed by Teknium
parent 1958208744
commit a5e06078b2
2 changed files with 81 additions and 1 deletions

View file

@ -15,6 +15,7 @@ import contextvars
import json
import logging
import os
import re
import shutil
import subprocess
import sys
@ -45,6 +46,59 @@ from hermes_time import now as _hermes_now
logger = logging.getLogger(__name__)
def _summarize_cron_failure_for_delivery(job: dict, error: str | None) -> str:
"""Return a compact one-line failure message for chat delivery.
Full details stay in the cron output directory and the logs. Chat should
show the operator what broke without dumping provider JSON, retry noise, or
stack traces into the delivery channel.
"""
job_name = job.get("name") or job.get("id") or "cron job"
text = (error or "unknown error").strip()
lower = text.lower()
# Provider/API failures are the common noisy path. Keep these short.
if "429" in text or "rate limit" in lower or "usage limit" in lower:
reason = "rate limit"
if "weekly usage limit" in lower:
reason = "weekly usage limit"
elif "quota" in lower:
reason = "quota limit"
return (
f"⚠️ Cron '{job_name}' failed: provider {reason}. "
"Fallback chain was exhausted or unavailable. "
"Full details saved in cron output."
)
if "readtimeout" in lower or "timed out" in lower or "timeout" in lower:
return (
f"⚠️ Cron '{job_name}' failed: provider timeout. "
"Fallback chain was exhausted or unavailable. "
"Full details saved in cron output."
)
# Match authentication/authorization wording at a word boundary and the
# 401/403 status codes as whole tokens, so "oauth", "4015" and similar do
# not trip a misleading auth message.
if re.search(r"authenticat|authoriz", lower) or re.search(r"\b(401|403)\b", text):
return (
f"⚠️ Cron '{job_name}' failed: provider authentication error. "
"Full details saved in cron output."
)
# Strip common exception wrappers and collapse provider payloads. Bound
# the input first so a multi-KB provider blob cannot slow the
# substitutions.
cleaned = re.sub(
r"^(RuntimeError|Exception|ValueError|HTTPStatusError):\s*",
"", text[:2000],
)
cleaned = re.sub(r"\s+", " ", cleaned).strip()
if len(cleaned) > 180:
cleaned = cleaned[:177].rstrip() + "..."
return f"⚠️ Cron '{job_name}' failed: {cleaned}"
class CronPromptInjectionBlocked(Exception):
"""Raised by _build_job_prompt when the fully-assembled prompt trips the
injection scanner. Caught in run_job so the operator sees a clean
@ -2056,7 +2110,7 @@ def tick(verbose: bool = True, adapters=None, loop=None, sync: bool = True) -> i
# Deliver the final response to the origin/target chat.
# If the agent responded with [SILENT], skip delivery (but
# output is already saved above). Failed jobs always deliver.
deliver_content = final_response if success else f"⚠️ Cron job '{job.get('name', job['id'])}' failed:\n{error}"
deliver_content = final_response if success else _summarize_cron_failure_for_delivery(job, error)
# Treat whitespace-only final responses the same as empty
# responses: do not deliver a blank message, and let the
# empty-response guard below mark the run as a soft failure.

View file

@ -272,6 +272,28 @@ def _git_env(
return env
def _repair_bare_repo_dirs(store: Path) -> None:
"""Recreate refs/ and branches/ dirs that ``git gc`` may have removed.
``git gc --prune=now`` on a bare repo with only packed refs can remove
the empty ``refs/heads/`` directory. Git 2.34+ requires ``refs/`` (and
some versions require ``branches/``) to exist even when all refs are
packed in ``packed-refs``. Without them, ``git add -A`` returns
``fatal: not a git repository`` and all checkpoint operations fail
silently.
"""
for subdir in ("refs/heads", "branches"):
path = store / subdir
if not path.exists():
try:
path.mkdir(parents=True, exist_ok=True)
logger.debug("Repaired missing %s in checkpoint store", subdir)
except OSError as exc:
logger.warning(
"Cannot create %s in checkpoint store: %s", subdir, exc,
)
def _run_git(
args: List[str],
store: Path,
@ -1086,6 +1108,7 @@ class CheckpointManager:
["gc", "--prune=now", "--quiet"],
store, working_dir, timeout=_GIT_TIMEOUT * 3,
)
_repair_bare_repo_dirs(store)
def _enforce_size_cap(self, store: Path) -> None:
"""If total store size exceeds ``max_total_size_mb``, drop oldest
@ -1173,6 +1196,7 @@ class CheckpointManager:
["gc", "--prune=now", "--quiet"],
store, str(store.parent), timeout=_GIT_TIMEOUT * 3,
)
_repair_bare_repo_dirs(store)
def format_checkpoint_list(checkpoints: List[Dict], directory: str) -> str:
@ -1384,6 +1408,7 @@ def prune_checkpoints(
["gc", "--prune=now", "--quiet"],
store, str(base), timeout=_GIT_TIMEOUT * 3,
)
_repair_bare_repo_dirs(store)
# Size-cap pass across remaining projects.
if max_total_size_mb > 0:
@ -1455,6 +1480,7 @@ def prune_checkpoints(
["gc", "--prune=now", "--quiet"],
store, str(base), timeout=_GIT_TIMEOUT * 3,
)
_repair_bare_repo_dirs(store)
size_after = _dir_size_bytes(base)
delta = size_before - size_after