diff --git a/hermes_cli/config.py b/hermes_cli/config.py index ebaa4d3c140..6f4adb56f3e 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -2087,6 +2087,22 @@ DEFAULT_CONFIG = { "inherit_mcp_toolsets": True, "max_iterations": 50, # per-subagent iteration cap (each subagent gets its own budget, # independent of the parent's max_iterations) + # Subagent summaries return to the parent's context verbatim. A batch + # fan-out (N children) returns N summaries at once, which can exceed + # the parent's context window and trigger a compression/429 death + # spiral. delegate_task sizes each summary against the parent's + # remaining context headroom (split across the batch); when it must + # trim, the full text is spilled to ~/.hermes/cache/delegation/ + # (mounted into remote backends) and the in-context summary becomes a + # head+tail window plus a footer with the exact read_file offset to + # page the omitted middle — the same convention web_extract uses for + # large pages. Nothing is lost. max_summary_chars is a hard per-summary + # character ceiling layered on top of that dynamic budget + # (belt-and-suspenders for models that ignore the "be concise" + # instruction). 0 disables the hard ceiling; the dynamic headroom + # budget still applies. + "max_summary_chars": 24000, + "child_timeout_seconds": 0, # optional wall-clock cap per child agent. 0 (default) # = no timeout: children fail only from real errors # (API, tools, iteration budget), never a delegation diff --git a/scripts/release.py b/scripts/release.py index 0560809f61e..f2a4fafdb71 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -79,6 +79,7 @@ AUTHOR_MAP = { "15205536+595650661@users.noreply.github.com": "595650661", # PR #37851 salvage (classify MiniMax new_sensitive content filter → content_policy_blocked; #32421) "benbenwyb@gmail.com": "benbenlijie", # PR #47205 salvage (named custom-provider extra_body + Z.AI Coding overload adaptive backoff; #50663) "dana@added-value.co.il": "Danamove", # PR #46726 salvage (kill venv-resident pythonw gateway before recreating venv on Windows; #47036/#47557/#47910) + "rcint@klaith.com": "rc-int", # PR #9126 salvage / co-author (cap subagent summary size vs parent context overflow) "145739220+wgu9@users.noreply.github.com": "wgu9", # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325) "165020384+uperLu@users.noreply.github.com": "uperLu", # PR #50958 salvage (rename plugins/cron → plugins/cron_providers; #50872) "277269729+yusekiotacode@users.noreply.github.com": "yusekiotacode", # PR #48706 salvage (anthropic OAuth login token endpoint → platform.claude.com; #45250/#49821) diff --git a/tests/tools/test_delegate_summary_budget.py b/tests/tools/test_delegate_summary_budget.py new file mode 100644 index 00000000000..4039892ddd1 --- /dev/null +++ b/tests/tools/test_delegate_summary_budget.py @@ -0,0 +1,126 @@ +"""Tests for subagent summary budgeting (PR #9126). + +delegate_task caps subagent summaries against the parent's remaining context +headroom (split across the batch) before they enter the parent's context, and +spills the full text to disk so nothing is lost. This guards the +compression/429 death spiral that batch fan-out could trigger by returning N +full summaries verbatim into the parent. +""" + +import os +import tempfile + +import pytest + +import tools.delegate_tool as dt + + +class _FakeCompressor: + def __init__(self, context_length, max_tokens): + self.context_length = context_length + self.max_tokens = max_tokens + + +class _FakeParent: + def __init__(self, context_length, used_tokens, max_tokens): + self.context_compressor = _FakeCompressor(context_length, max_tokens) + self.session_prompt_tokens = used_tokens + + +def test_small_summaries_pass_through_untouched(): + parent = _FakeParent(context_length=200_000, used_tokens=10_000, max_tokens=8_000) + results = [ + {"task_index": 0, "summary": "short result A", "status": "completed"}, + {"task_index": 1, "summary": "short result B", "status": "completed"}, + ] + dt._apply_summary_budget(results, parent) + assert results[0]["summary"] == "short result A" + assert "summary_truncated" not in results[0] + assert "summary_truncated" not in results[1] + + +def test_batch_overflow_trimmed_and_spilled_losslessly(monkeypatch): + # Isolate spill directory to a temp HERMES_HOME. + with tempfile.TemporaryDirectory() as td: + monkeypatch.setenv("HERMES_HOME", os.path.join(td, ".hermes")) + # Distinct head + tail markers so we can prove the tail survives. + big = "HEAD_MARKER\n" + ("X" * 50_000) + "\nTAIL_MARKER" + # Parent nearly full (120k/131k) → tiny headroom → aggressive trim. + parent = _FakeParent(context_length=131_000, used_tokens=120_000, max_tokens=8_000) + results = [ + {"task_index": i, "summary": big, "status": "completed"} for i in range(5) + ] + dt._apply_summary_budget(results, parent) + for r in results: + assert r["summary_truncated"] is True + assert len(r["summary"]) < len(big) + # Head+tail window: both ends survive in-context. + assert "HEAD_MARKER" in r["summary"] + assert "TAIL_MARKER" in r["summary"] + path = r.get("summary_full_path") + assert path and os.path.exists(path) + # The spill file holds the FULL original text — nothing is lost. + with open(path, encoding="utf-8") as fh: + assert fh.read() == big + # The footer points the parent at the full version with an offset. + assert "read_file" in r["summary"] + assert "offset=" in r["summary"] + # Spilled into the delegation cache (mounted into remote backends). + assert os.path.join("cache", "delegation") in path + + +def test_dynamic_budget_shrinks_as_batch_grows(): + def cap_for(n): + return dt._parent_summary_char_budget( + _FakeParent(131_000, 30_000, 8_000), n + ) + + c1, c5, c20 = cap_for(1), cap_for(5), cap_for(20) + assert c1 is not None and c5 is not None and c20 is not None + # More children → smaller per-summary slice of the same headroom. + assert c1 > c5 > c20 + + +def test_floor_enforced_when_parent_over_budget(): + # Parent already over its context budget → each summary gets only the floor. + budget = dt._parent_summary_char_budget( + _FakeParent(131_000, 200_000, 8_000), 3 + ) + assert budget == dt._MIN_SUMMARY_CHARS + + +def test_unknown_context_falls_back_to_static_ceiling(monkeypatch): + class _Bare: + pass + + # No compressor → dynamic budget is unknowable. + assert dt._parent_summary_char_budget(_Bare(), 3) is None + + # But the static delegation.max_summary_chars ceiling still trims. + with tempfile.TemporaryDirectory() as td: + monkeypatch.setenv("HERMES_HOME", os.path.join(td, ".hermes")) + results = [{"task_index": 0, "summary": "Y" * 40_000, "status": "completed"}] + dt._apply_summary_budget(results, _Bare()) + assert results[0]["summary_truncated"] is True + assert len(results[0]["summary"]) < 40_000 + + +def test_disabled_static_ceiling_and_unknown_context_leaves_summary_intact(monkeypatch): + class _Bare: + pass + + # Both caps off: static ceiling 0 (disabled) AND no compressor (no dynamic). + monkeypatch.setattr(dt, "_load_config", lambda: {"max_summary_chars": 0}) + results = [{"task_index": 0, "summary": "Z" * 40_000, "status": "completed"}] + dt._apply_summary_budget(results, _Bare()) + assert "summary_truncated" not in results[0] + assert len(results[0]["summary"]) == 40_000 + + +def test_empty_results_is_noop(): + # No summaries → nothing to do, must not raise. + dt._apply_summary_budget([], _FakeParent(131_000, 1_000, 8_000)) + dt._apply_summary_budget( + [{"task_index": 0, "status": "failed", "summary": None}], + _FakeParent(131_000, 1_000, 8_000), + ) diff --git a/tools/credential_files.py b/tools/credential_files.py index b7f1ff773e8..885e0e37d9f 100644 --- a/tools/credential_files.py +++ b/tools/credential_files.py @@ -350,6 +350,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [ ("cache/videos", "video_cache"), ("cache/screenshots", "browser_screenshots"), ("cache/web", "web_cache"), + ("cache/delegation", "delegation_cache"), ] diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py index 9de91b671b9..49a76318254 100644 --- a/tools/delegate_tool.py +++ b/tools/delegate_tool.py @@ -601,6 +601,18 @@ def _preserve_parent_mcp_toolsets( DEFAULT_MAX_ITERATIONS = 50 +# Hard per-summary character ceiling layered on top of the dynamic +# headroom budget (see _apply_summary_budget). Belt-and-suspenders for +# models that ignore the "be concise" instruction. 0 disables the ceiling. +DEFAULT_MAX_SUMMARY_CHARS = 24000 +# Fraction of the parent's *remaining* context headroom that the whole batch +# of subagent summaries is allowed to consume. The per-summary budget is this +# slice divided across the batch, so N children can't collectively blow the +# parent's window (the compression/429 death-spiral in issue/PR #9126). +_SUMMARY_HEADROOM_FRACTION = 0.5 +# Floor so a single summary always gets a usable slice even when the parent is +# already nearly full — below this we'd be truncating to noise. +_MIN_SUMMARY_CHARS = 2000 # No default wall-clock cap on child agents: legitimate heavy subagent work # (deep reviews, research fan-outs, slow reasoning models) was being killed # mid-task. Errors should come from what the child actually does; stuck-child @@ -702,8 +714,10 @@ def _build_child_system_prompt( "- Any issues encountered\n\n" "Important workspace rule: Never assume a repository lives at /workspace/... or any other container-style path unless the task/context explicitly gives that path. " "If no exact local path is provided, discover it first before issuing git/workdir-specific commands.\n\n" - "Be thorough but concise -- your response is returned to the " - "parent agent as a summary." + "Keep your final summary tight: lead with outcomes, prefer bullet " + "points over paragraphs, and don't replay your whole process. Your " + "response is returned to the parent agent as a summary, and overlong " + "summaries crowd out the parent's context window." ) if role == "orchestrator": child_note = ( @@ -1509,6 +1523,181 @@ def _dump_subagent_timeout_diagnostic( return None +def _spill_summary_to_file(task_index: int, summary: str) -> Optional[str]: + """Write a subagent's full summary to the delegation cache and return path. + + Mirrors web_extract's ``_store_full_text``: the file lands in + ``cache/delegation`` which is mounted read-only into remote backends + (Docker/Modal/SSH) via ``credential_files._CACHE_DIRS``, so the parent's + terminal/``read_file`` tools can page through the complete text on any + backend. Returns the absolute path, or None on failure (best-effort: + the trimmed head+tail is still returned to the parent regardless). + """ + try: + from hermes_constants import get_hermes_dir + import datetime as _dt + + cache_dir = get_hermes_dir("cache/delegation", "delegation_cache") + cache_dir.mkdir(parents=True, exist_ok=True) + ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S_%f") + path = cache_dir / f"subagent-summary-{task_index}-{ts}.txt" + path.write_text(summary, encoding="utf-8") + return str(path) + except Exception as exc: + logger.debug("Failed to spill subagent summary to file: %s", exc) + return None + + +def _trim_summary_with_footer( + summary: str, cap: int, task_index: int +) -> tuple[str, Optional[str]]: + """Return (model_text, spill_path) for one over-budget summary. + + Mirrors web_extract's ``_truncate_with_footer``: keep a head+tail window + (~75% head / ~25% tail, snapped to line boundaries) so the subagent's + opening AND its closing (outcomes / files-changed / issues, which live at + the end) both survive, spill the full text to disk, and append a footer + telling the parent exactly how much it's seeing and the precise + ``read_file offset=`` to page into the omitted middle. Deterministic. + """ + original_len = len(summary) + head_budget = int(cap * 0.75) + tail_budget = cap - head_budget + + head = summary[:head_budget] + tail = summary[-tail_budget:] + # Snap the head cut back to the last newline so we don't slice mid-line. + nl = head.rfind("\n") + if nl > head_budget * 0.5: + head = head[:nl] + # Snap the tail cut forward to the next newline for the same reason. + nl = tail.find("\n") + if 0 <= nl < tail_budget * 0.5: + tail = tail[nl + 1:] + + spill_path = _spill_summary_to_file(task_index, summary) + + footer_lines = [ + "", + "─" * 8 + " [SUMMARY TRUNCATED] " + "─" * 8, + f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) " + f"of {original_len:,} total — trimmed to protect the parent's context window.", + ] + if spill_path: + # read_file is 1-indexed; +2 moves past the last head line shown. + middle_start_line = head.count("\n") + 2 + footer_lines.append(f"Full subagent output saved to: {spill_path}") + footer_lines.append( + f'To read the omitted middle: read_file path="{spill_path}" ' + f"offset={middle_start_line} limit=200 (the file is the complete " + f"summary; raise/lower offset to page through it)." + ) + else: + footer_lines.append( + "Full output could not be stored to disk; the head+tail above is " + "all that was preserved." + ) + footer_lines.append("─" * 37) + + model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail + "\n".join(footer_lines) + return model_text, spill_path + + +def _parent_summary_char_budget(parent_agent, n_summaries: int) -> Optional[int]: + """Per-summary character budget sized against the parent's *remaining* + context headroom, split across the batch. + + The overflow this guards against is N summaries entering the parent + context at once (batch fan-out), not any single summary being large. We + take a fraction of the headroom the parent has left (resolved context + length minus what's already in its prompt) and divide it across the batch, + converting tokens→chars at the standard ~4 chars/token estimate. + + Returns the per-summary char budget, or None when the parent's context + state is unknown (no compressor / no token count) — in which case the + caller falls back to the static char ceiling only. + """ + try: + compressor = getattr(parent_agent, "context_compressor", None) + context_length = getattr(compressor, "context_length", None) + if not isinstance(context_length, int) or context_length <= 0: + return None + + used_tokens = getattr(parent_agent, "session_prompt_tokens", 0) + if not isinstance(used_tokens, (int, float)) or used_tokens < 0: + used_tokens = 0 + + # Reserve the compressor's output budget so we measure INPUT headroom. + reserved = getattr(compressor, "max_tokens", 0) or 0 + headroom_tokens = context_length - int(used_tokens) - int(reserved) + if headroom_tokens <= 0: + # Parent is already over budget — give each summary only the floor. + return _MIN_SUMMARY_CHARS + + batch_token_budget = int(headroom_tokens * _SUMMARY_HEADROOM_FRACTION) + per_summary_tokens = batch_token_budget // max(1, n_summaries) + per_summary_chars = per_summary_tokens * 4 # ~4 chars/token + return max(_MIN_SUMMARY_CHARS, per_summary_chars) + except Exception: + logger.debug("Summary budget computation failed", exc_info=True) + return None + + +def _apply_summary_budget(results: List[Dict[str, Any]], parent_agent) -> None: + """Trim subagent summaries in-place so the batch can't overflow the + parent's context window, spilling full text to disk so nothing is lost. + + The effective per-summary cap is the MIN of: + - the dynamic headroom budget (remaining parent context ÷ batch size), and + - the static ``delegation.max_summary_chars`` ceiling (0 = disabled). + + When a summary exceeds the cap, its full text is written to a file and the + in-context summary becomes a head slice plus a pointer to that file. This + addresses issue/PR #9126: batch fan-out returned N full summaries verbatim, + blowing the parent context and (on rate-limited providers) triggering a + compression/429 death spiral. + """ + summaries = [ + r for r in results if isinstance(r, dict) and isinstance(r.get("summary"), str) and r["summary"] + ] + if not summaries: + return + + cfg = _load_config() + try: + static_ceiling = int(cfg.get("max_summary_chars", DEFAULT_MAX_SUMMARY_CHARS)) + except (TypeError, ValueError): + static_ceiling = DEFAULT_MAX_SUMMARY_CHARS + + dynamic_budget = _parent_summary_char_budget(parent_agent, len(summaries)) + + # Combine the two caps. Either can be absent/disabled. + candidates = [c for c in (static_ceiling, dynamic_budget) if c and c > 0] + if not candidates: + return # both disabled / unknown → leave summaries untouched + cap = min(candidates) + + for entry in summaries: + summary = entry["summary"] + if len(summary) <= cap: + continue + original_len = len(summary) + model_text, spill_path = _trim_summary_with_footer( + summary, cap, entry.get("task_index", -1) + ) + entry["summary"] = model_text + entry["summary_truncated"] = True + if spill_path: + entry["summary_full_path"] = spill_path + logger.debug( + "[subagent-%s] summary trimmed %d → ~%d chars (spill=%s)", + entry.get("task_index", "?"), + original_len, + cap, + spill_path or "none", + ) + + def _run_single_child( task_index: int, goal: str, @@ -2437,6 +2626,12 @@ def delegate_task( # Sort by task_index so results match input order results.sort(key=lambda r: r["task_index"]) + # Cap subagent summaries against the parent's remaining context + # headroom (split across the batch) before they enter the parent's + # conversation. Full text is spilled to disk so nothing is lost. + # Covers both the single-task and batch paths. See PR #9126. + _apply_summary_budget(results, parent_agent) + # Notify parent's memory provider of delegation outcomes if ( parent_agent