fix(delegation): budget subagent summaries against parent context headroom

Batch delegation returned each subagent's full final_response verbatim into the parent's context. A fan-out of N children could dump 60k+ tokens at once, blowing the parent's context window and — on rate-limited providers — triggering a compression/429 death spiral (429 misread as context-too-large -> window step-down -> retry loop -> conversation dies). Cap each summary against the parent's *remaining* context headroom split across the batch (not a magic char count). When trimming, mirror the web_extract convention: spill the full text to cache/delegation (mounted into remote backends via credential_files._CACHE_DIRS) and return a head+tail window (75/25, line-snapped) plus a footer with the exact read_file offset to page the omitted middle. Both the subagent's opening AND its closing (outcomes / files-changed / issues, which live at the end) survive in-context, and nothing is lost — the parent can read_file the full version on any backend. delegation.max_summary_chars (default 24000) is a static ceiling layered on top as belt-and-suspenders for models that ignore 'be concise'; 0 disables it. Child prompt tightened to lead with outcomes / bullets. Co-authored-by: rc-int <rcint@klaith.com>
2026-07-01 12:02:05 +00:00 · 2026-06-30 01:30:47 -07:00 · 2026-06-30 01:30:47 -07:00 · 35a0803a3b
commit 35a0803a3b
parent 3b2bb30c5d
5 changed files with 341 additions and 2 deletions
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -2087,6 +2087,22 @@ DEFAULT_CONFIG = {
        "inherit_mcp_toolsets": True,
        "max_iterations": 50,  # per-subagent iteration cap (each subagent gets its own budget,
                               # independent of the parent's max_iterations)
+        # Subagent summaries return to the parent's context verbatim. A batch
+        # fan-out (N children) returns N summaries at once, which can exceed
+        # the parent's context window and trigger a compression/429 death
+        # spiral. delegate_task sizes each summary against the parent's
+        # remaining context headroom (split across the batch); when it must
+        # trim, the full text is spilled to ~/.hermes/cache/delegation/
+        # (mounted into remote backends) and the in-context summary becomes a
+        # head+tail window plus a footer with the exact read_file offset to
+        # page the omitted middle — the same convention web_extract uses for
+        # large pages. Nothing is lost. max_summary_chars is a hard per-summary
+        # character ceiling layered on top of that dynamic budget
+        # (belt-and-suspenders for models that ignore the "be concise"
+        # instruction). 0 disables the hard ceiling; the dynamic headroom
+        # budget still applies.
+        "max_summary_chars": 24000,
+
        "child_timeout_seconds": 0,  # optional wall-clock cap per child agent. 0 (default)
                                     # = no timeout: children fail only from real errors
                                     # (API, tools, iteration budget), never a delegation
--- a/scripts/release.py
+++ b/scripts/release.py
@ -79,6 +79,7 @@ AUTHOR_MAP = {
    "15205536+595650661@users.noreply.github.com": "595650661",  # PR #37851 salvage (classify MiniMax new_sensitive content filter → content_policy_blocked; #32421)
    "benbenwyb@gmail.com": "benbenlijie",  # PR #47205 salvage (named custom-provider extra_body + Z.AI Coding overload adaptive backoff; #50663)
    "dana@added-value.co.il": "Danamove",  # PR #46726 salvage (kill venv-resident pythonw gateway before recreating venv on Windows; #47036/#47557/#47910)
+    "rcint@klaith.com": "rc-int",  # PR #9126 salvage / co-author (cap subagent summary size vs parent context overflow)
    "145739220+wgu9@users.noreply.github.com": "wgu9",  # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325)
    "165020384+uperLu@users.noreply.github.com": "uperLu",  # PR #50958 salvage (rename plugins/cron → plugins/cron_providers; #50872)
    "277269729+yusekiotacode@users.noreply.github.com": "yusekiotacode",  # PR #48706 salvage (anthropic OAuth login token endpoint → platform.claude.com; #45250/#49821)
--- a/tests/tools/test_delegate_summary_budget.py
+++ b/tests/tools/test_delegate_summary_budget.py
@ -0,0 +1,126 @@
+"""Tests for subagent summary budgeting (PR #9126).
+
+delegate_task caps subagent summaries against the parent's remaining context
+headroom (split across the batch) before they enter the parent's context, and
+spills the full text to disk so nothing is lost. This guards the
+compression/429 death spiral that batch fan-out could trigger by returning N
+full summaries verbatim into the parent.
+"""
+
+import os
+import tempfile
+
+import pytest
+
+import tools.delegate_tool as dt
+
+
+class _FakeCompressor:
+    def __init__(self, context_length, max_tokens):
+        self.context_length = context_length
+        self.max_tokens = max_tokens
+
+
+class _FakeParent:
+    def __init__(self, context_length, used_tokens, max_tokens):
+        self.context_compressor = _FakeCompressor(context_length, max_tokens)
+        self.session_prompt_tokens = used_tokens
+
+
+def test_small_summaries_pass_through_untouched():
+    parent = _FakeParent(context_length=200_000, used_tokens=10_000, max_tokens=8_000)
+    results = [
+        {"task_index": 0, "summary": "short result A", "status": "completed"},
+        {"task_index": 1, "summary": "short result B", "status": "completed"},
+    ]
+    dt._apply_summary_budget(results, parent)
+    assert results[0]["summary"] == "short result A"
+    assert "summary_truncated" not in results[0]
+    assert "summary_truncated" not in results[1]
+
+
+def test_batch_overflow_trimmed_and_spilled_losslessly(monkeypatch):
+    # Isolate spill directory to a temp HERMES_HOME.
+    with tempfile.TemporaryDirectory() as td:
+        monkeypatch.setenv("HERMES_HOME", os.path.join(td, ".hermes"))
+        # Distinct head + tail markers so we can prove the tail survives.
+        big = "HEAD_MARKER\n" + ("X" * 50_000) + "\nTAIL_MARKER"
+        # Parent nearly full (120k/131k) → tiny headroom → aggressive trim.
+        parent = _FakeParent(context_length=131_000, used_tokens=120_000, max_tokens=8_000)
+        results = [
+            {"task_index": i, "summary": big, "status": "completed"} for i in range(5)
+        ]
+        dt._apply_summary_budget(results, parent)
+        for r in results:
+            assert r["summary_truncated"] is True
+            assert len(r["summary"]) < len(big)
+            # Head+tail window: both ends survive in-context.
+            assert "HEAD_MARKER" in r["summary"]
+            assert "TAIL_MARKER" in r["summary"]
+            path = r.get("summary_full_path")
+            assert path and os.path.exists(path)
+            # The spill file holds the FULL original text — nothing is lost.
+            with open(path, encoding="utf-8") as fh:
+                assert fh.read() == big
+            # The footer points the parent at the full version with an offset.
+            assert "read_file" in r["summary"]
+            assert "offset=" in r["summary"]
+            # Spilled into the delegation cache (mounted into remote backends).
+            assert os.path.join("cache", "delegation") in path
+
+
+def test_dynamic_budget_shrinks_as_batch_grows():
+    def cap_for(n):
+        return dt._parent_summary_char_budget(
+            _FakeParent(131_000, 30_000, 8_000), n
+        )
+
+    c1, c5, c20 = cap_for(1), cap_for(5), cap_for(20)
+    assert c1 is not None and c5 is not None and c20 is not None
+    # More children → smaller per-summary slice of the same headroom.
+    assert c1 > c5 > c20
+
+
+def test_floor_enforced_when_parent_over_budget():
+    # Parent already over its context budget → each summary gets only the floor.
+    budget = dt._parent_summary_char_budget(
+        _FakeParent(131_000, 200_000, 8_000), 3
+    )
+    assert budget == dt._MIN_SUMMARY_CHARS
+
+
+def test_unknown_context_falls_back_to_static_ceiling(monkeypatch):
+    class _Bare:
+        pass
+
+    # No compressor → dynamic budget is unknowable.
+    assert dt._parent_summary_char_budget(_Bare(), 3) is None
+
+    # But the static delegation.max_summary_chars ceiling still trims.
+    with tempfile.TemporaryDirectory() as td:
+        monkeypatch.setenv("HERMES_HOME", os.path.join(td, ".hermes"))
+        results = [{"task_index": 0, "summary": "Y" * 40_000, "status": "completed"}]
+        dt._apply_summary_budget(results, _Bare())
+        assert results[0]["summary_truncated"] is True
+        assert len(results[0]["summary"]) < 40_000
+
+
+def test_disabled_static_ceiling_and_unknown_context_leaves_summary_intact(monkeypatch):
+    class _Bare:
+        pass
+
+    # Both caps off: static ceiling 0 (disabled) AND no compressor (no dynamic).
+    monkeypatch.setattr(dt, "_load_config", lambda: {"max_summary_chars": 0})
+    results = [{"task_index": 0, "summary": "Z" * 40_000, "status": "completed"}]
+    dt._apply_summary_budget(results, _Bare())
+    assert "summary_truncated" not in results[0]
+    assert len(results[0]["summary"]) == 40_000
+
+
+def test_empty_results_is_noop():
+    # No summaries → nothing to do, must not raise.
+    dt._apply_summary_budget([], _FakeParent(131_000, 1_000, 8_000))
+    dt._apply_summary_budget(
+        [{"task_index": 0, "status": "failed", "summary": None}],
+        _FakeParent(131_000, 1_000, 8_000),
+    )
--- a/tools/credential_files.py
+++ b/tools/credential_files.py
@ -350,6 +350,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [
    ("cache/videos", "video_cache"),
    ("cache/screenshots", "browser_screenshots"),
    ("cache/web", "web_cache"),
+    ("cache/delegation", "delegation_cache"),
 ]


--- a/tools/delegate_tool.py
+++ b/tools/delegate_tool.py
@ -601,6 +601,18 @@ def _preserve_parent_mcp_toolsets(


 DEFAULT_MAX_ITERATIONS = 50
+# Hard per-summary character ceiling layered on top of the dynamic
+# headroom budget (see _apply_summary_budget). Belt-and-suspenders for
+# models that ignore the "be concise" instruction. 0 disables the ceiling.
+DEFAULT_MAX_SUMMARY_CHARS = 24000
+# Fraction of the parent's *remaining* context headroom that the whole batch
+# of subagent summaries is allowed to consume. The per-summary budget is this
+# slice divided across the batch, so N children can't collectively blow the
+# parent's window (the compression/429 death-spiral in issue/PR #9126).
+_SUMMARY_HEADROOM_FRACTION = 0.5
+# Floor so a single summary always gets a usable slice even when the parent is
+# already nearly full — below this we'd be truncating to noise.
+_MIN_SUMMARY_CHARS = 2000
 # No default wall-clock cap on child agents: legitimate heavy subagent work
 # (deep reviews, research fan-outs, slow reasoning models) was being killed
 # mid-task. Errors should come from what the child actually does; stuck-child
@ -702,8 +714,10 @@ def _build_child_system_prompt(
        "- Any issues encountered\n\n"
        "Important workspace rule: Never assume a repository lives at /workspace/... or any other container-style path unless the task/context explicitly gives that path. "
        "If no exact local path is provided, discover it first before issuing git/workdir-specific commands.\n\n"
-        "Be thorough but concise -- your response is returned to the "
-        "parent agent as a summary."
+        "Keep your final summary tight: lead with outcomes, prefer bullet "
+        "points over paragraphs, and don't replay your whole process. Your "
+        "response is returned to the parent agent as a summary, and overlong "
+        "summaries crowd out the parent's context window."
    )
    if role == "orchestrator":
        child_note = (
@ -1509,6 +1523,181 @@ def _dump_subagent_timeout_diagnostic(
        return None


+def _spill_summary_to_file(task_index: int, summary: str) -> Optional[str]:
+    """Write a subagent's full summary to the delegation cache and return path.
+
+    Mirrors web_extract's ``_store_full_text``: the file lands in
+    ``cache/delegation`` which is mounted read-only into remote backends
+    (Docker/Modal/SSH) via ``credential_files._CACHE_DIRS``, so the parent's
+    terminal/``read_file`` tools can page through the complete text on any
+    backend. Returns the absolute path, or None on failure (best-effort:
+    the trimmed head+tail is still returned to the parent regardless).
+    """
+    try:
+        from hermes_constants import get_hermes_dir
+        import datetime as _dt
+
+        cache_dir = get_hermes_dir("cache/delegation", "delegation_cache")
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        path = cache_dir / f"subagent-summary-{task_index}-{ts}.txt"
+        path.write_text(summary, encoding="utf-8")
+        return str(path)
+    except Exception as exc:
+        logger.debug("Failed to spill subagent summary to file: %s", exc)
+        return None
+
+
+def _trim_summary_with_footer(
+    summary: str, cap: int, task_index: int
+) -> tuple[str, Optional[str]]:
+    """Return (model_text, spill_path) for one over-budget summary.
+
+    Mirrors web_extract's ``_truncate_with_footer``: keep a head+tail window
+    (~75% head / ~25% tail, snapped to line boundaries) so the subagent's
+    opening AND its closing (outcomes / files-changed / issues, which live at
+    the end) both survive, spill the full text to disk, and append a footer
+    telling the parent exactly how much it's seeing and the precise
+    ``read_file offset=`` to page into the omitted middle. Deterministic.
+    """
+    original_len = len(summary)
+    head_budget = int(cap * 0.75)
+    tail_budget = cap - head_budget
+
+    head = summary[:head_budget]
+    tail = summary[-tail_budget:]
+    # Snap the head cut back to the last newline so we don't slice mid-line.
+    nl = head.rfind("\n")
+    if nl > head_budget * 0.5:
+        head = head[:nl]
+    # Snap the tail cut forward to the next newline for the same reason.
+    nl = tail.find("\n")
+    if 0 <= nl < tail_budget * 0.5:
+        tail = tail[nl + 1:]
+
+    spill_path = _spill_summary_to_file(task_index, summary)
+
+    footer_lines = [
+        "",
+        "─" * 8 + " [SUMMARY TRUNCATED] " + "─" * 8,
+        f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) "
+        f"of {original_len:,} total — trimmed to protect the parent's context window.",
+    ]
+    if spill_path:
+        # read_file is 1-indexed; +2 moves past the last head line shown.
+        middle_start_line = head.count("\n") + 2
+        footer_lines.append(f"Full subagent output saved to: {spill_path}")
+        footer_lines.append(
+            f'To read the omitted middle: read_file path="{spill_path}" '
+            f"offset={middle_start_line} limit=200  (the file is the complete "
+            f"summary; raise/lower offset to page through it)."
+        )
+    else:
+        footer_lines.append(
+            "Full output could not be stored to disk; the head+tail above is "
+            "all that was preserved."
+        )
+    footer_lines.append("─" * 37)
+
+    model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail + "\n".join(footer_lines)
+    return model_text, spill_path
+
+
+def _parent_summary_char_budget(parent_agent, n_summaries: int) -> Optional[int]:
+    """Per-summary character budget sized against the parent's *remaining*
+    context headroom, split across the batch.
+
+    The overflow this guards against is N summaries entering the parent
+    context at once (batch fan-out), not any single summary being large.  We
+    take a fraction of the headroom the parent has left (resolved context
+    length minus what's already in its prompt) and divide it across the batch,
+    converting tokens→chars at the standard ~4 chars/token estimate.
+
+    Returns the per-summary char budget, or None when the parent's context
+    state is unknown (no compressor / no token count) — in which case the
+    caller falls back to the static char ceiling only.
+    """
+    try:
+        compressor = getattr(parent_agent, "context_compressor", None)
+        context_length = getattr(compressor, "context_length", None)
+        if not isinstance(context_length, int) or context_length <= 0:
+            return None
+
+        used_tokens = getattr(parent_agent, "session_prompt_tokens", 0)
+        if not isinstance(used_tokens, (int, float)) or used_tokens < 0:
+            used_tokens = 0
+
+        # Reserve the compressor's output budget so we measure INPUT headroom.
+        reserved = getattr(compressor, "max_tokens", 0) or 0
+        headroom_tokens = context_length - int(used_tokens) - int(reserved)
+        if headroom_tokens <= 0:
+            # Parent is already over budget — give each summary only the floor.
+            return _MIN_SUMMARY_CHARS
+
+        batch_token_budget = int(headroom_tokens * _SUMMARY_HEADROOM_FRACTION)
+        per_summary_tokens = batch_token_budget // max(1, n_summaries)
+        per_summary_chars = per_summary_tokens * 4  # ~4 chars/token
+        return max(_MIN_SUMMARY_CHARS, per_summary_chars)
+    except Exception:
+        logger.debug("Summary budget computation failed", exc_info=True)
+        return None
+
+
+def _apply_summary_budget(results: List[Dict[str, Any]], parent_agent) -> None:
+    """Trim subagent summaries in-place so the batch can't overflow the
+    parent's context window, spilling full text to disk so nothing is lost.
+
+    The effective per-summary cap is the MIN of:
+      - the dynamic headroom budget (remaining parent context ÷ batch size), and
+      - the static ``delegation.max_summary_chars`` ceiling (0 = disabled).
+
+    When a summary exceeds the cap, its full text is written to a file and the
+    in-context summary becomes a head slice plus a pointer to that file. This
+    addresses issue/PR #9126: batch fan-out returned N full summaries verbatim,
+    blowing the parent context and (on rate-limited providers) triggering a
+    compression/429 death spiral.
+    """
+    summaries = [
+        r for r in results if isinstance(r, dict) and isinstance(r.get("summary"), str) and r["summary"]
+    ]
+    if not summaries:
+        return
+
+    cfg = _load_config()
+    try:
+        static_ceiling = int(cfg.get("max_summary_chars", DEFAULT_MAX_SUMMARY_CHARS))
+    except (TypeError, ValueError):
+        static_ceiling = DEFAULT_MAX_SUMMARY_CHARS
+
+    dynamic_budget = _parent_summary_char_budget(parent_agent, len(summaries))
+
+    # Combine the two caps. Either can be absent/disabled.
+    candidates = [c for c in (static_ceiling, dynamic_budget) if c and c > 0]
+    if not candidates:
+        return  # both disabled / unknown → leave summaries untouched
+    cap = min(candidates)
+
+    for entry in summaries:
+        summary = entry["summary"]
+        if len(summary) <= cap:
+            continue
+        original_len = len(summary)
+        model_text, spill_path = _trim_summary_with_footer(
+            summary, cap, entry.get("task_index", -1)
+        )
+        entry["summary"] = model_text
+        entry["summary_truncated"] = True
+        if spill_path:
+            entry["summary_full_path"] = spill_path
+        logger.debug(
+            "[subagent-%s] summary trimmed %d → ~%d chars (spill=%s)",
+            entry.get("task_index", "?"),
+            original_len,
+            cap,
+            spill_path or "none",
+        )
+
+
 def _run_single_child(
    task_index: int,
    goal: str,
@ -2437,6 +2626,12 @@ def delegate_task(
            # Sort by task_index so results match input order
            results.sort(key=lambda r: r["task_index"])

+        # Cap subagent summaries against the parent's remaining context
+        # headroom (split across the batch) before they enter the parent's
+        # conversation. Full text is spilled to disk so nothing is lost.
+        # Covers both the single-task and batch paths. See PR #9126.
+        _apply_summary_budget(results, parent_agent)
+
        # Notify parent's memory provider of delegation outcomes
        if (
            parent_agent