mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
* fix(cron): add default retention to per-run job output to bound disk usage (#52383) Per-run cron output (cron/output/<job>/<timestamp>.md) is written once per execution and was never pruned, so a frequently-scheduled job on a long-running deploy accumulates one file per run indefinitely and can fill the volume ('no space left on device'). save_job_output() now keeps the most recent N output files per job and removes older ones. N defaults to 50 and is configurable via cron.output_retention; a non-positive value disables pruning for operators who manage cleanup externally. Salvaged from #52402 by @0xDevNinja. Closes #52383 * fix(config): add cron.output_retention to DEFAULT_CONFIG Follow-up to #52383: the retention config key was functional via get()-with-default but missing from DEFAULT_CONFIG, so the deep-merge wouldn't auto-populate it for new installs. Add it explicitly. --------- Co-authored-by: 0xDevNinja <manmit0x@gmail.com>
This commit is contained in:
parent
ffa3d3c811
commit
e4ff494860
3 changed files with 130 additions and 3 deletions
57
cron/jobs.py
57
cron/jobs.py
|
|
@ -1497,16 +1497,64 @@ def _get_due_jobs_locked() -> List[Dict[str, Any]]:
|
|||
return due
|
||||
|
||||
|
||||
# Per-run cron output (`cron/output/<job>/<timestamp>.md`) is written once per
|
||||
# execution. Unlike the quick-snapshot store (`hermes_cli.backup`, capped at 20)
|
||||
# it had no retention, so a frequently-scheduled job on a long-running deploy
|
||||
# accumulated one file per run forever and could fill the disk (#52383). Keep the
|
||||
# most recent N files per job; a non-positive value disables pruning (opt-out).
|
||||
_CRON_OUTPUT_DEFAULT_KEEP = 50
|
||||
|
||||
|
||||
def _cron_output_keep() -> int:
|
||||
"""Resolve the per-job output-file retention cap from config (``cron.output_retention``)."""
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config() or {}
|
||||
cron_cfg = cfg.get("cron", {}) if isinstance(cfg, dict) else {}
|
||||
return int(cron_cfg.get("output_retention", _CRON_OUTPUT_DEFAULT_KEEP))
|
||||
except Exception:
|
||||
return _CRON_OUTPUT_DEFAULT_KEEP
|
||||
|
||||
|
||||
def _prune_job_output(job_output_dir: Path, keep: int) -> int:
|
||||
"""Remove the oldest ``*.md`` run-output files beyond *keep*. Returns count deleted.
|
||||
|
||||
Mirrors the quick-snapshot retention in ``hermes_cli.backup._prune_quick_snapshots``:
|
||||
output filenames are timestamp-based (``%Y-%m-%d_%H-%M-%S.md``) so a reverse
|
||||
lexical sort orders newest-first, and everything past *keep* is the tail to
|
||||
drop. A non-positive *keep* disables pruning. Pruning failures are swallowed
|
||||
so they can never break output saving.
|
||||
"""
|
||||
if keep <= 0:
|
||||
return 0
|
||||
try:
|
||||
files = sorted(
|
||||
(f for f in job_output_dir.glob("*.md") if f.is_file()),
|
||||
key=lambda f: f.name,
|
||||
reverse=True,
|
||||
)
|
||||
except OSError:
|
||||
return 0
|
||||
deleted = 0
|
||||
for stale in files[keep:]:
|
||||
try:
|
||||
stale.unlink()
|
||||
deleted += 1
|
||||
except OSError as exc:
|
||||
logger.debug("Failed to prune cron output %s: %s", stale.name, exc)
|
||||
return deleted
|
||||
|
||||
|
||||
def save_job_output(job_id: str, output: str):
|
||||
"""Save job output to file."""
|
||||
ensure_dirs()
|
||||
job_output_dir = _job_output_dir(job_id)
|
||||
job_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
_secure_dir(job_output_dir)
|
||||
|
||||
|
||||
timestamp = _hermes_now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
output_file = job_output_dir / f"{timestamp}.md"
|
||||
|
||||
|
||||
fd, tmp_path = tempfile.mkstemp(dir=str(job_output_dir), suffix='.tmp', prefix='.output_')
|
||||
try:
|
||||
with os.fdopen(fd, 'w', encoding='utf-8') as f:
|
||||
|
|
@ -1521,7 +1569,10 @@ def save_job_output(job_id: str, output: str):
|
|||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
|
||||
# Bound per-job output growth so long-running deploys don't fill the disk (#52383).
|
||||
_prune_job_output(job_output_dir, _cron_output_keep())
|
||||
|
||||
return output_file
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2454,6 +2454,10 @@ DEFAULT_CONFIG = {
|
|||
# 1 = serial (pre-v0.9 behaviour).
|
||||
# Also overridable via HERMES_CRON_MAX_PARALLEL env var.
|
||||
"max_parallel_jobs": None,
|
||||
# Per-job output-file retention: save_job_output keeps the N most
|
||||
# recent .md files and prunes older ones. 0 or negative disables
|
||||
# pruning (for operators who manage cleanup externally). Default 50.
|
||||
"output_retention": 50,
|
||||
},
|
||||
|
||||
# Kanban multi-agent coordination — controls the dispatcher loop that
|
||||
|
|
|
|||
|
|
@ -1242,3 +1242,75 @@ class TestSaveJobOutput:
|
|||
with pytest.raises(ValueError, match="output path"):
|
||||
save_job_output(str(tmp_cron_dir / "outside"), "# Results")
|
||||
assert not (tmp_cron_dir / "outside").exists()
|
||||
|
||||
|
||||
class TestCronOutputRetention:
|
||||
"""Per-run cron output must self-prune so long deploys don't fill the disk (#52383)."""
|
||||
|
||||
@staticmethod
|
||||
def _seed(d, count):
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
names = [f"2026-06-25_10-00-{i:02d}.md" for i in range(count)]
|
||||
for n in names:
|
||||
(d / n).write_text("x")
|
||||
return names
|
||||
|
||||
def test_prune_keeps_newest_n(self, tmp_path):
|
||||
from cron.jobs import _prune_job_output
|
||||
d = tmp_path / "job"
|
||||
names = self._seed(d, 10)
|
||||
assert _prune_job_output(d, keep=3) == 7
|
||||
assert sorted(p.name for p in d.glob("*.md")) == names[-3:]
|
||||
|
||||
def test_prune_noop_when_under_cap(self, tmp_path):
|
||||
from cron.jobs import _prune_job_output
|
||||
d = tmp_path / "job"
|
||||
self._seed(d, 3)
|
||||
assert _prune_job_output(d, keep=5) == 0
|
||||
assert len(list(d.glob("*.md"))) == 3
|
||||
|
||||
def test_prune_disabled_when_keep_non_positive(self, tmp_path):
|
||||
from cron.jobs import _prune_job_output
|
||||
d = tmp_path / "job"
|
||||
self._seed(d, 5)
|
||||
assert _prune_job_output(d, keep=0) == 0
|
||||
assert _prune_job_output(d, keep=-1) == 0
|
||||
assert len(list(d.glob("*.md"))) == 5
|
||||
|
||||
def test_prune_ignores_non_md_and_temp_files(self, tmp_path):
|
||||
from cron.jobs import _prune_job_output
|
||||
d = tmp_path / "job"
|
||||
self._seed(d, 4)
|
||||
(d / ".output_abc.tmp").write_text("partial")
|
||||
(d / "manifest.json").write_text("{}")
|
||||
_prune_job_output(d, keep=2)
|
||||
assert (d / ".output_abc.tmp").exists()
|
||||
assert (d / "manifest.json").exists()
|
||||
assert len(list(d.glob("*.md"))) == 2
|
||||
|
||||
def test_save_job_output_prunes_old_runs(self, tmp_cron_dir, monkeypatch):
|
||||
from cron.jobs import save_job_output, _job_output_dir
|
||||
monkeypatch.setattr("cron.jobs._cron_output_keep", lambda: 3)
|
||||
seq = iter(
|
||||
datetime(2026, 6, 25, 10, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=i)
|
||||
for i in range(8)
|
||||
)
|
||||
monkeypatch.setattr("cron.jobs._hermes_now", lambda: next(seq))
|
||||
for _ in range(8):
|
||||
save_job_output("job1", "report")
|
||||
files = sorted(_job_output_dir("job1").glob("*.md"))
|
||||
assert len(files) == 3 # only the 3 most-recent runs survive
|
||||
|
||||
def test_cron_output_keep_reads_config(self, monkeypatch):
|
||||
import cron.jobs as jobs
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config", lambda: {"cron": {"output_retention": 7}}
|
||||
)
|
||||
assert jobs._cron_output_keep() == 7
|
||||
|
||||
def test_cron_output_keep_defaults_on_bad_config(self, monkeypatch):
|
||||
import cron.jobs as jobs
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config", lambda: {"cron": {"output_retention": "oops"}}
|
||||
)
|
||||
assert jobs._cron_output_keep() == jobs._CRON_OUTPUT_DEFAULT_KEEP
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue