fix(delegation): budget subagent summaries against parent context headroom

Batch delegation returned each subagent's full final_response verbatim
into the parent's context. A fan-out of N children could dump 60k+ tokens
at once, blowing the parent's context window and — on rate-limited
providers — triggering a compression/429 death spiral (429 misread as
context-too-large -> window step-down -> retry loop -> conversation dies).

Cap each summary against the parent's *remaining* context headroom split
across the batch (not a magic char count). When trimming, mirror the
web_extract convention: spill the full text to cache/delegation (mounted
into remote backends via credential_files._CACHE_DIRS) and return a
head+tail window (75/25, line-snapped) plus a footer with the exact
read_file offset to page the omitted middle. Both the subagent's opening
AND its closing (outcomes / files-changed / issues, which live at the end)
survive in-context, and nothing is lost — the parent can read_file the
full version on any backend.

delegation.max_summary_chars (default 24000) is a static ceiling layered
on top as belt-and-suspenders for models that ignore 'be concise'; 0
disables it. Child prompt tightened to lead with outcomes / bullets.

Co-authored-by: rc-int <rcint@klaith.com>
This commit is contained in:
teknium1 2026-06-30 01:30:47 -07:00 committed by Teknium
parent 3b2bb30c5d
commit 35a0803a3b
5 changed files with 341 additions and 2 deletions

View file

@ -2087,6 +2087,22 @@ DEFAULT_CONFIG = {
"inherit_mcp_toolsets": True,
"max_iterations": 50, # per-subagent iteration cap (each subagent gets its own budget,
# independent of the parent's max_iterations)
# Subagent summaries return to the parent's context verbatim. A batch
# fan-out (N children) returns N summaries at once, which can exceed
# the parent's context window and trigger a compression/429 death
# spiral. delegate_task sizes each summary against the parent's
# remaining context headroom (split across the batch); when it must
# trim, the full text is spilled to ~/.hermes/cache/delegation/
# (mounted into remote backends) and the in-context summary becomes a
# head+tail window plus a footer with the exact read_file offset to
# page the omitted middle — the same convention web_extract uses for
# large pages. Nothing is lost. max_summary_chars is a hard per-summary
# character ceiling layered on top of that dynamic budget
# (belt-and-suspenders for models that ignore the "be concise"
# instruction). 0 disables the hard ceiling; the dynamic headroom
# budget still applies.
"max_summary_chars": 24000,
"child_timeout_seconds": 0, # optional wall-clock cap per child agent. 0 (default)
# = no timeout: children fail only from real errors
# (API, tools, iteration budget), never a delegation

View file

@ -79,6 +79,7 @@ AUTHOR_MAP = {
"15205536+595650661@users.noreply.github.com": "595650661", # PR #37851 salvage (classify MiniMax new_sensitive content filter → content_policy_blocked; #32421)
"benbenwyb@gmail.com": "benbenlijie", # PR #47205 salvage (named custom-provider extra_body + Z.AI Coding overload adaptive backoff; #50663)
"dana@added-value.co.il": "Danamove", # PR #46726 salvage (kill venv-resident pythonw gateway before recreating venv on Windows; #47036/#47557/#47910)
"rcint@klaith.com": "rc-int", # PR #9126 salvage / co-author (cap subagent summary size vs parent context overflow)
"145739220+wgu9@users.noreply.github.com": "wgu9", # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325)
"165020384+uperLu@users.noreply.github.com": "uperLu", # PR #50958 salvage (rename plugins/cron → plugins/cron_providers; #50872)
"277269729+yusekiotacode@users.noreply.github.com": "yusekiotacode", # PR #48706 salvage (anthropic OAuth login token endpoint → platform.claude.com; #45250/#49821)

View file

@ -0,0 +1,126 @@
"""Tests for subagent summary budgeting (PR #9126).
delegate_task caps subagent summaries against the parent's remaining context
headroom (split across the batch) before they enter the parent's context, and
spills the full text to disk so nothing is lost. This guards the
compression/429 death spiral that batch fan-out could trigger by returning N
full summaries verbatim into the parent.
"""
import os
import tempfile
import pytest
import tools.delegate_tool as dt
class _FakeCompressor:
def __init__(self, context_length, max_tokens):
self.context_length = context_length
self.max_tokens = max_tokens
class _FakeParent:
def __init__(self, context_length, used_tokens, max_tokens):
self.context_compressor = _FakeCompressor(context_length, max_tokens)
self.session_prompt_tokens = used_tokens
def test_small_summaries_pass_through_untouched():
parent = _FakeParent(context_length=200_000, used_tokens=10_000, max_tokens=8_000)
results = [
{"task_index": 0, "summary": "short result A", "status": "completed"},
{"task_index": 1, "summary": "short result B", "status": "completed"},
]
dt._apply_summary_budget(results, parent)
assert results[0]["summary"] == "short result A"
assert "summary_truncated" not in results[0]
assert "summary_truncated" not in results[1]
def test_batch_overflow_trimmed_and_spilled_losslessly(monkeypatch):
# Isolate spill directory to a temp HERMES_HOME.
with tempfile.TemporaryDirectory() as td:
monkeypatch.setenv("HERMES_HOME", os.path.join(td, ".hermes"))
# Distinct head + tail markers so we can prove the tail survives.
big = "HEAD_MARKER\n" + ("X" * 50_000) + "\nTAIL_MARKER"
# Parent nearly full (120k/131k) → tiny headroom → aggressive trim.
parent = _FakeParent(context_length=131_000, used_tokens=120_000, max_tokens=8_000)
results = [
{"task_index": i, "summary": big, "status": "completed"} for i in range(5)
]
dt._apply_summary_budget(results, parent)
for r in results:
assert r["summary_truncated"] is True
assert len(r["summary"]) < len(big)
# Head+tail window: both ends survive in-context.
assert "HEAD_MARKER" in r["summary"]
assert "TAIL_MARKER" in r["summary"]
path = r.get("summary_full_path")
assert path and os.path.exists(path)
# The spill file holds the FULL original text — nothing is lost.
with open(path, encoding="utf-8") as fh:
assert fh.read() == big
# The footer points the parent at the full version with an offset.
assert "read_file" in r["summary"]
assert "offset=" in r["summary"]
# Spilled into the delegation cache (mounted into remote backends).
assert os.path.join("cache", "delegation") in path
def test_dynamic_budget_shrinks_as_batch_grows():
def cap_for(n):
return dt._parent_summary_char_budget(
_FakeParent(131_000, 30_000, 8_000), n
)
c1, c5, c20 = cap_for(1), cap_for(5), cap_for(20)
assert c1 is not None and c5 is not None and c20 is not None
# More children → smaller per-summary slice of the same headroom.
assert c1 > c5 > c20
def test_floor_enforced_when_parent_over_budget():
# Parent already over its context budget → each summary gets only the floor.
budget = dt._parent_summary_char_budget(
_FakeParent(131_000, 200_000, 8_000), 3
)
assert budget == dt._MIN_SUMMARY_CHARS
def test_unknown_context_falls_back_to_static_ceiling(monkeypatch):
class _Bare:
pass
# No compressor → dynamic budget is unknowable.
assert dt._parent_summary_char_budget(_Bare(), 3) is None
# But the static delegation.max_summary_chars ceiling still trims.
with tempfile.TemporaryDirectory() as td:
monkeypatch.setenv("HERMES_HOME", os.path.join(td, ".hermes"))
results = [{"task_index": 0, "summary": "Y" * 40_000, "status": "completed"}]
dt._apply_summary_budget(results, _Bare())
assert results[0]["summary_truncated"] is True
assert len(results[0]["summary"]) < 40_000
def test_disabled_static_ceiling_and_unknown_context_leaves_summary_intact(monkeypatch):
class _Bare:
pass
# Both caps off: static ceiling 0 (disabled) AND no compressor (no dynamic).
monkeypatch.setattr(dt, "_load_config", lambda: {"max_summary_chars": 0})
results = [{"task_index": 0, "summary": "Z" * 40_000, "status": "completed"}]
dt._apply_summary_budget(results, _Bare())
assert "summary_truncated" not in results[0]
assert len(results[0]["summary"]) == 40_000
def test_empty_results_is_noop():
# No summaries → nothing to do, must not raise.
dt._apply_summary_budget([], _FakeParent(131_000, 1_000, 8_000))
dt._apply_summary_budget(
[{"task_index": 0, "status": "failed", "summary": None}],
_FakeParent(131_000, 1_000, 8_000),
)

View file

@ -350,6 +350,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [
("cache/videos", "video_cache"),
("cache/screenshots", "browser_screenshots"),
("cache/web", "web_cache"),
("cache/delegation", "delegation_cache"),
]

View file

@ -601,6 +601,18 @@ def _preserve_parent_mcp_toolsets(
DEFAULT_MAX_ITERATIONS = 50
# Hard per-summary character ceiling layered on top of the dynamic
# headroom budget (see _apply_summary_budget). Belt-and-suspenders for
# models that ignore the "be concise" instruction. 0 disables the ceiling.
DEFAULT_MAX_SUMMARY_CHARS = 24000
# Fraction of the parent's *remaining* context headroom that the whole batch
# of subagent summaries is allowed to consume. The per-summary budget is this
# slice divided across the batch, so N children can't collectively blow the
# parent's window (the compression/429 death-spiral in issue/PR #9126).
_SUMMARY_HEADROOM_FRACTION = 0.5
# Floor so a single summary always gets a usable slice even when the parent is
# already nearly full — below this we'd be truncating to noise.
_MIN_SUMMARY_CHARS = 2000
# No default wall-clock cap on child agents: legitimate heavy subagent work
# (deep reviews, research fan-outs, slow reasoning models) was being killed
# mid-task. Errors should come from what the child actually does; stuck-child
@ -702,8 +714,10 @@ def _build_child_system_prompt(
"- Any issues encountered\n\n"
"Important workspace rule: Never assume a repository lives at /workspace/... or any other container-style path unless the task/context explicitly gives that path. "
"If no exact local path is provided, discover it first before issuing git/workdir-specific commands.\n\n"
"Be thorough but concise -- your response is returned to the "
"parent agent as a summary."
"Keep your final summary tight: lead with outcomes, prefer bullet "
"points over paragraphs, and don't replay your whole process. Your "
"response is returned to the parent agent as a summary, and overlong "
"summaries crowd out the parent's context window."
)
if role == "orchestrator":
child_note = (
@ -1509,6 +1523,181 @@ def _dump_subagent_timeout_diagnostic(
return None
def _spill_summary_to_file(task_index: int, summary: str) -> Optional[str]:
"""Write a subagent's full summary to the delegation cache and return path.
Mirrors web_extract's ``_store_full_text``: the file lands in
``cache/delegation`` which is mounted read-only into remote backends
(Docker/Modal/SSH) via ``credential_files._CACHE_DIRS``, so the parent's
terminal/``read_file`` tools can page through the complete text on any
backend. Returns the absolute path, or None on failure (best-effort:
the trimmed head+tail is still returned to the parent regardless).
"""
try:
from hermes_constants import get_hermes_dir
import datetime as _dt
cache_dir = get_hermes_dir("cache/delegation", "delegation_cache")
cache_dir.mkdir(parents=True, exist_ok=True)
ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
path = cache_dir / f"subagent-summary-{task_index}-{ts}.txt"
path.write_text(summary, encoding="utf-8")
return str(path)
except Exception as exc:
logger.debug("Failed to spill subagent summary to file: %s", exc)
return None
def _trim_summary_with_footer(
summary: str, cap: int, task_index: int
) -> tuple[str, Optional[str]]:
"""Return (model_text, spill_path) for one over-budget summary.
Mirrors web_extract's ``_truncate_with_footer``: keep a head+tail window
(~75% head / ~25% tail, snapped to line boundaries) so the subagent's
opening AND its closing (outcomes / files-changed / issues, which live at
the end) both survive, spill the full text to disk, and append a footer
telling the parent exactly how much it's seeing and the precise
``read_file offset=`` to page into the omitted middle. Deterministic.
"""
original_len = len(summary)
head_budget = int(cap * 0.75)
tail_budget = cap - head_budget
head = summary[:head_budget]
tail = summary[-tail_budget:]
# Snap the head cut back to the last newline so we don't slice mid-line.
nl = head.rfind("\n")
if nl > head_budget * 0.5:
head = head[:nl]
# Snap the tail cut forward to the next newline for the same reason.
nl = tail.find("\n")
if 0 <= nl < tail_budget * 0.5:
tail = tail[nl + 1:]
spill_path = _spill_summary_to_file(task_index, summary)
footer_lines = [
"",
"" * 8 + " [SUMMARY TRUNCATED] " + "" * 8,
f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) "
f"of {original_len:,} total — trimmed to protect the parent's context window.",
]
if spill_path:
# read_file is 1-indexed; +2 moves past the last head line shown.
middle_start_line = head.count("\n") + 2
footer_lines.append(f"Full subagent output saved to: {spill_path}")
footer_lines.append(
f'To read the omitted middle: read_file path="{spill_path}" '
f"offset={middle_start_line} limit=200 (the file is the complete "
f"summary; raise/lower offset to page through it)."
)
else:
footer_lines.append(
"Full output could not be stored to disk; the head+tail above is "
"all that was preserved."
)
footer_lines.append("" * 37)
model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail + "\n".join(footer_lines)
return model_text, spill_path
def _parent_summary_char_budget(parent_agent, n_summaries: int) -> Optional[int]:
"""Per-summary character budget sized against the parent's *remaining*
context headroom, split across the batch.
The overflow this guards against is N summaries entering the parent
context at once (batch fan-out), not any single summary being large. We
take a fraction of the headroom the parent has left (resolved context
length minus what's already in its prompt) and divide it across the batch,
converting tokenschars at the standard ~4 chars/token estimate.
Returns the per-summary char budget, or None when the parent's context
state is unknown (no compressor / no token count) in which case the
caller falls back to the static char ceiling only.
"""
try:
compressor = getattr(parent_agent, "context_compressor", None)
context_length = getattr(compressor, "context_length", None)
if not isinstance(context_length, int) or context_length <= 0:
return None
used_tokens = getattr(parent_agent, "session_prompt_tokens", 0)
if not isinstance(used_tokens, (int, float)) or used_tokens < 0:
used_tokens = 0
# Reserve the compressor's output budget so we measure INPUT headroom.
reserved = getattr(compressor, "max_tokens", 0) or 0
headroom_tokens = context_length - int(used_tokens) - int(reserved)
if headroom_tokens <= 0:
# Parent is already over budget — give each summary only the floor.
return _MIN_SUMMARY_CHARS
batch_token_budget = int(headroom_tokens * _SUMMARY_HEADROOM_FRACTION)
per_summary_tokens = batch_token_budget // max(1, n_summaries)
per_summary_chars = per_summary_tokens * 4 # ~4 chars/token
return max(_MIN_SUMMARY_CHARS, per_summary_chars)
except Exception:
logger.debug("Summary budget computation failed", exc_info=True)
return None
def _apply_summary_budget(results: List[Dict[str, Any]], parent_agent) -> None:
"""Trim subagent summaries in-place so the batch can't overflow the
parent's context window, spilling full text to disk so nothing is lost.
The effective per-summary cap is the MIN of:
- the dynamic headroom budget (remaining parent context ÷ batch size), and
- the static ``delegation.max_summary_chars`` ceiling (0 = disabled).
When a summary exceeds the cap, its full text is written to a file and the
in-context summary becomes a head slice plus a pointer to that file. This
addresses issue/PR #9126: batch fan-out returned N full summaries verbatim,
blowing the parent context and (on rate-limited providers) triggering a
compression/429 death spiral.
"""
summaries = [
r for r in results if isinstance(r, dict) and isinstance(r.get("summary"), str) and r["summary"]
]
if not summaries:
return
cfg = _load_config()
try:
static_ceiling = int(cfg.get("max_summary_chars", DEFAULT_MAX_SUMMARY_CHARS))
except (TypeError, ValueError):
static_ceiling = DEFAULT_MAX_SUMMARY_CHARS
dynamic_budget = _parent_summary_char_budget(parent_agent, len(summaries))
# Combine the two caps. Either can be absent/disabled.
candidates = [c for c in (static_ceiling, dynamic_budget) if c and c > 0]
if not candidates:
return # both disabled / unknown → leave summaries untouched
cap = min(candidates)
for entry in summaries:
summary = entry["summary"]
if len(summary) <= cap:
continue
original_len = len(summary)
model_text, spill_path = _trim_summary_with_footer(
summary, cap, entry.get("task_index", -1)
)
entry["summary"] = model_text
entry["summary_truncated"] = True
if spill_path:
entry["summary_full_path"] = spill_path
logger.debug(
"[subagent-%s] summary trimmed %d → ~%d chars (spill=%s)",
entry.get("task_index", "?"),
original_len,
cap,
spill_path or "none",
)
def _run_single_child(
task_index: int,
goal: str,
@ -2437,6 +2626,12 @@ def delegate_task(
# Sort by task_index so results match input order
results.sort(key=lambda r: r["task_index"])
# Cap subagent summaries against the parent's remaining context
# headroom (split across the batch) before they enter the parent's
# conversation. Full text is spilled to disk so nothing is lost.
# Covers both the single-task and batch paths. See PR #9126.
_apply_summary_budget(results, parent_agent)
# Notify parent's memory provider of delegation outcomes
if (
parent_agent