mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
fix(cron): compact cron failure messages + repair bare repo dirs after git gc
Two small, focused fixes for the cron scheduler and checkpoint manager. 1. _summarize_cron_failure_for_delivery (cron/scheduler.py): Replaces the raw error dump in _process_job with a compact pattern-matched summary. Provider rate limits, timeouts, and authentication errors now produce a short human-readable message instead of dumping multi-KB provider JSON into the delivery channel. 2. _repair_bare_repo_dirs (tools/checkpoint_manager.py): Recreates refs/heads/ and branches/ directories after git gc --prune=now, which can remove empty dirs from bare repos and cause subsequent git add -A to fail with 'fatal: not a git repository'. Called after all four git gc call sites. Both fixes use only standard library imports and plug into existing call sites with no architectural changes.
This commit is contained in:
parent
1958208744
commit
a5e06078b2
2 changed files with 81 additions and 1 deletion
|
|
@ -15,6 +15,7 @@ import contextvars
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
|
@ -45,6 +46,59 @@ from hermes_time import now as _hermes_now
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _summarize_cron_failure_for_delivery(job: dict, error: str | None) -> str:
|
||||
"""Return a compact one-line failure message for chat delivery.
|
||||
|
||||
Full details stay in the cron output directory and the logs. Chat should
|
||||
show the operator what broke without dumping provider JSON, retry noise, or
|
||||
stack traces into the delivery channel.
|
||||
"""
|
||||
job_name = job.get("name") or job.get("id") or "cron job"
|
||||
text = (error or "unknown error").strip()
|
||||
lower = text.lower()
|
||||
|
||||
# Provider/API failures are the common noisy path. Keep these short.
|
||||
if "429" in text or "rate limit" in lower or "usage limit" in lower:
|
||||
reason = "rate limit"
|
||||
if "weekly usage limit" in lower:
|
||||
reason = "weekly usage limit"
|
||||
elif "quota" in lower:
|
||||
reason = "quota limit"
|
||||
return (
|
||||
f"⚠️ Cron '{job_name}' failed: provider {reason}. "
|
||||
"Fallback chain was exhausted or unavailable. "
|
||||
"Full details saved in cron output."
|
||||
)
|
||||
|
||||
if "readtimeout" in lower or "timed out" in lower or "timeout" in lower:
|
||||
return (
|
||||
f"⚠️ Cron '{job_name}' failed: provider timeout. "
|
||||
"Fallback chain was exhausted or unavailable. "
|
||||
"Full details saved in cron output."
|
||||
)
|
||||
|
||||
# Match authentication/authorization wording at a word boundary and the
|
||||
# 401/403 status codes as whole tokens, so "oauth", "4015" and similar do
|
||||
# not trip a misleading auth message.
|
||||
if re.search(r"authenticat|authoriz", lower) or re.search(r"\b(401|403)\b", text):
|
||||
return (
|
||||
f"⚠️ Cron '{job_name}' failed: provider authentication error. "
|
||||
"Full details saved in cron output."
|
||||
)
|
||||
|
||||
# Strip common exception wrappers and collapse provider payloads. Bound
|
||||
# the input first so a multi-KB provider blob cannot slow the
|
||||
# substitutions.
|
||||
cleaned = re.sub(
|
||||
r"^(RuntimeError|Exception|ValueError|HTTPStatusError):\s*",
|
||||
"", text[:2000],
|
||||
)
|
||||
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
||||
if len(cleaned) > 180:
|
||||
cleaned = cleaned[:177].rstrip() + "..."
|
||||
return f"⚠️ Cron '{job_name}' failed: {cleaned}"
|
||||
|
||||
|
||||
class CronPromptInjectionBlocked(Exception):
|
||||
"""Raised by _build_job_prompt when the fully-assembled prompt trips the
|
||||
injection scanner. Caught in run_job so the operator sees a clean
|
||||
|
|
@ -2056,7 +2110,7 @@ def tick(verbose: bool = True, adapters=None, loop=None, sync: bool = True) -> i
|
|||
# Deliver the final response to the origin/target chat.
|
||||
# If the agent responded with [SILENT], skip delivery (but
|
||||
# output is already saved above). Failed jobs always deliver.
|
||||
deliver_content = final_response if success else f"⚠️ Cron job '{job.get('name', job['id'])}' failed:\n{error}"
|
||||
deliver_content = final_response if success else _summarize_cron_failure_for_delivery(job, error)
|
||||
# Treat whitespace-only final responses the same as empty
|
||||
# responses: do not deliver a blank message, and let the
|
||||
# empty-response guard below mark the run as a soft failure.
|
||||
|
|
|
|||
|
|
@ -272,6 +272,28 @@ def _git_env(
|
|||
return env
|
||||
|
||||
|
||||
def _repair_bare_repo_dirs(store: Path) -> None:
    """Ensure ``refs/heads`` and ``branches`` exist under the checkpoint store.

    Intended to run right after ``git gc --prune=now``. Per the original
    author's note, gc on a bare repo whose refs are all packed can delete the
    now-empty ``refs/heads/`` directory, after which ``git add -A`` reports
    ``fatal: not a git repository`` and checkpoint operations fail silently
    (NOTE(review): exact git-version behaviour not verifiable from this file).

    Args:
        store: Root directory of the bare checkpoint repository.

    Returns:
        None. Missing directories are created in place; a failure to create
        one is logged as a warning and never raised.
    """
    for rel_name in ("refs/heads", "branches"):
        target = store / rel_name
        # Nothing to repair when the entry is already present.
        if target.exists():
            continue
        try:
            target.mkdir(parents=True, exist_ok=True)
            logger.debug("Repaired missing %s in checkpoint store", rel_name)
        except OSError as err:
            # Best-effort repair: report and move on to the next directory.
            logger.warning(
                "Cannot create %s in checkpoint store: %s", rel_name, err,
            )
|
||||
|
||||
|
||||
def _run_git(
|
||||
args: List[str],
|
||||
store: Path,
|
||||
|
|
@ -1086,6 +1108,7 @@ class CheckpointManager:
|
|||
["gc", "--prune=now", "--quiet"],
|
||||
store, working_dir, timeout=_GIT_TIMEOUT * 3,
|
||||
)
|
||||
_repair_bare_repo_dirs(store)
|
||||
|
||||
def _enforce_size_cap(self, store: Path) -> None:
|
||||
"""If total store size exceeds ``max_total_size_mb``, drop oldest
|
||||
|
|
@ -1173,6 +1196,7 @@ class CheckpointManager:
|
|||
["gc", "--prune=now", "--quiet"],
|
||||
store, str(store.parent), timeout=_GIT_TIMEOUT * 3,
|
||||
)
|
||||
_repair_bare_repo_dirs(store)
|
||||
|
||||
|
||||
def format_checkpoint_list(checkpoints: List[Dict], directory: str) -> str:
|
||||
|
|
@ -1384,6 +1408,7 @@ def prune_checkpoints(
|
|||
["gc", "--prune=now", "--quiet"],
|
||||
store, str(base), timeout=_GIT_TIMEOUT * 3,
|
||||
)
|
||||
_repair_bare_repo_dirs(store)
|
||||
|
||||
# Size-cap pass across remaining projects.
|
||||
if max_total_size_mb > 0:
|
||||
|
|
@ -1455,6 +1480,7 @@ def prune_checkpoints(
|
|||
["gc", "--prune=now", "--quiet"],
|
||||
store, str(base), timeout=_GIT_TIMEOUT * 3,
|
||||
)
|
||||
_repair_bare_repo_dirs(store)
|
||||
|
||||
size_after = _dir_size_bytes(base)
|
||||
delta = size_before - size_after
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue