fix(cron): add default retention to per-run job output (#52383) (#52646)

* fix(cron): add default retention to per-run job output to bound disk usage (#52383)

Per-run cron output (cron/output/<job>/<timestamp>.md) is written once
per execution and was never pruned, so a frequently-scheduled job on
a long-running deploy accumulates one file per run indefinitely and
can fill the volume ('no space left on device').

save_job_output() now keeps the most recent N output files per job and
removes older ones. N defaults to 50 and is configurable via
cron.output_retention; a non-positive value disables pruning for
operators who manage cleanup externally.

Salvaged from #52402 by @0xDevNinja.

Closes #52383

* fix(config): add cron.output_retention to DEFAULT_CONFIG

Follow-up to #52383: the retention config key was functional via
get()-with-default but missing from DEFAULT_CONFIG, so the deep-merge
wouldn't auto-populate it for new installs. Add it explicitly.

---------

Co-authored-by: 0xDevNinja <manmit0x@gmail.com>
This commit is contained in:
kshitij 2026-06-26 04:30:13 +05:30 committed by GitHub
parent ffa3d3c811
commit e4ff494860
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 130 additions and 3 deletions

View file

@ -1497,16 +1497,64 @@ def _get_due_jobs_locked() -> List[Dict[str, Any]]:
return due
# Per-run cron output (`cron/output/<job>/<timestamp>.md`) is written once per
# execution. Unlike the quick-snapshot store (`hermes_cli.backup`, capped at 20)
# it had no retention, so a frequently-scheduled job on a long-running deploy
# accumulated one file per run forever and could fill the disk (#52383). Keep the
# most recent N files per job; a non-positive value disables pruning (opt-out).
_CRON_OUTPUT_DEFAULT_KEEP = 50
def _cron_output_keep() -> int:
"""Resolve the per-job output-file retention cap from config (``cron.output_retention``)."""
try:
from hermes_cli.config import load_config
cfg = load_config() or {}
cron_cfg = cfg.get("cron", {}) if isinstance(cfg, dict) else {}
return int(cron_cfg.get("output_retention", _CRON_OUTPUT_DEFAULT_KEEP))
except Exception:
return _CRON_OUTPUT_DEFAULT_KEEP
def _prune_job_output(job_output_dir: Path, keep: int) -> int:
"""Remove the oldest ``*.md`` run-output files beyond *keep*. Returns count deleted.
Mirrors the quick-snapshot retention in ``hermes_cli.backup._prune_quick_snapshots``:
output filenames are timestamp-based (``%Y-%m-%d_%H-%M-%S.md``) so a reverse
lexical sort orders newest-first, and everything past *keep* is the tail to
drop. A non-positive *keep* disables pruning. Pruning failures are swallowed
so they can never break output saving.
"""
if keep <= 0:
return 0
try:
files = sorted(
(f for f in job_output_dir.glob("*.md") if f.is_file()),
key=lambda f: f.name,
reverse=True,
)
except OSError:
return 0
deleted = 0
for stale in files[keep:]:
try:
stale.unlink()
deleted += 1
except OSError as exc:
logger.debug("Failed to prune cron output %s: %s", stale.name, exc)
return deleted
def save_job_output(job_id: str, output: str):
"""Save job output to file."""
ensure_dirs()
job_output_dir = _job_output_dir(job_id)
job_output_dir.mkdir(parents=True, exist_ok=True)
_secure_dir(job_output_dir)
timestamp = _hermes_now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = job_output_dir / f"{timestamp}.md"
fd, tmp_path = tempfile.mkstemp(dir=str(job_output_dir), suffix='.tmp', prefix='.output_')
try:
with os.fdopen(fd, 'w', encoding='utf-8') as f:
@ -1521,7 +1569,10 @@ def save_job_output(job_id: str, output: str):
except OSError:
pass
raise
# Bound per-job output growth so long-running deploys don't fill the disk (#52383).
_prune_job_output(job_output_dir, _cron_output_keep())
return output_file

View file

@ -2454,6 +2454,10 @@ DEFAULT_CONFIG = {
# 1 = serial (pre-v0.9 behaviour).
# Also overridable via HERMES_CRON_MAX_PARALLEL env var.
"max_parallel_jobs": None,
# Per-job output-file retention: save_job_output keeps the N most
# recent .md files and prunes older ones. 0 or negative disables
# pruning (for operators who manage cleanup externally). Default 50.
"output_retention": 50,
},
# Kanban multi-agent coordination — controls the dispatcher loop that

View file

@ -1242,3 +1242,75 @@ class TestSaveJobOutput:
with pytest.raises(ValueError, match="output path"):
save_job_output(str(tmp_cron_dir / "outside"), "# Results")
assert not (tmp_cron_dir / "outside").exists()
class TestCronOutputRetention:
"""Per-run cron output must self-prune so long deploys don't fill the disk (#52383)."""
@staticmethod
def _seed(d, count):
d.mkdir(parents=True, exist_ok=True)
names = [f"2026-06-25_10-00-{i:02d}.md" for i in range(count)]
for n in names:
(d / n).write_text("x")
return names
def test_prune_keeps_newest_n(self, tmp_path):
from cron.jobs import _prune_job_output
d = tmp_path / "job"
names = self._seed(d, 10)
assert _prune_job_output(d, keep=3) == 7
assert sorted(p.name for p in d.glob("*.md")) == names[-3:]
def test_prune_noop_when_under_cap(self, tmp_path):
from cron.jobs import _prune_job_output
d = tmp_path / "job"
self._seed(d, 3)
assert _prune_job_output(d, keep=5) == 0
assert len(list(d.glob("*.md"))) == 3
def test_prune_disabled_when_keep_non_positive(self, tmp_path):
from cron.jobs import _prune_job_output
d = tmp_path / "job"
self._seed(d, 5)
assert _prune_job_output(d, keep=0) == 0
assert _prune_job_output(d, keep=-1) == 0
assert len(list(d.glob("*.md"))) == 5
def test_prune_ignores_non_md_and_temp_files(self, tmp_path):
from cron.jobs import _prune_job_output
d = tmp_path / "job"
self._seed(d, 4)
(d / ".output_abc.tmp").write_text("partial")
(d / "manifest.json").write_text("{}")
_prune_job_output(d, keep=2)
assert (d / ".output_abc.tmp").exists()
assert (d / "manifest.json").exists()
assert len(list(d.glob("*.md"))) == 2
def test_save_job_output_prunes_old_runs(self, tmp_cron_dir, monkeypatch):
from cron.jobs import save_job_output, _job_output_dir
monkeypatch.setattr("cron.jobs._cron_output_keep", lambda: 3)
seq = iter(
datetime(2026, 6, 25, 10, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=i)
for i in range(8)
)
monkeypatch.setattr("cron.jobs._hermes_now", lambda: next(seq))
for _ in range(8):
save_job_output("job1", "report")
files = sorted(_job_output_dir("job1").glob("*.md"))
assert len(files) == 3 # only the 3 most-recent runs survive
def test_cron_output_keep_reads_config(self, monkeypatch):
import cron.jobs as jobs
monkeypatch.setattr(
"hermes_cli.config.load_config", lambda: {"cron": {"output_retention": 7}}
)
assert jobs._cron_output_keep() == 7
def test_cron_output_keep_defaults_on_bad_config(self, monkeypatch):
import cron.jobs as jobs
monkeypatch.setattr(
"hermes_cli.config.load_config", lambda: {"cron": {"output_retention": "oops"}}
)
assert jobs._cron_output_keep() == jobs._CRON_OUTPUT_DEFAULT_KEEP