mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(delegation): budget subagent summaries against parent context headroom
Batch delegation returned each subagent's full final_response verbatim into the parent's context. A fan-out of N children could dump 60k+ tokens at once, blowing the parent's context window and — on rate-limited providers — triggering a compression/429 death spiral (429 misread as context-too-large -> window step-down -> retry loop -> conversation dies). Cap each summary against the parent's *remaining* context headroom split across the batch (not a magic char count). When trimming, mirror the web_extract convention: spill the full text to cache/delegation (mounted into remote backends via credential_files._CACHE_DIRS) and return a head+tail window (75/25, line-snapped) plus a footer with the exact read_file offset to page the omitted middle. Both the subagent's opening AND its closing (outcomes / files-changed / issues, which live at the end) survive in-context, and nothing is lost — the parent can read_file the full version on any backend. delegation.max_summary_chars (default 24000) is a static ceiling layered on top as belt-and-suspenders for models that ignore 'be concise'; 0 disables it. Child prompt tightened to lead with outcomes / bullets. Co-authored-by: rc-int <rcint@klaith.com>
This commit is contained in:
parent
3b2bb30c5d
commit
35a0803a3b
5 changed files with 341 additions and 2 deletions
|
|
@ -2087,6 +2087,22 @@ DEFAULT_CONFIG = {
|
|||
"inherit_mcp_toolsets": True,
|
||||
"max_iterations": 50, # per-subagent iteration cap (each subagent gets its own budget,
|
||||
# independent of the parent's max_iterations)
|
||||
# Subagent summaries return to the parent's context verbatim. A batch
|
||||
# fan-out (N children) returns N summaries at once, which can exceed
|
||||
# the parent's context window and trigger a compression/429 death
|
||||
# spiral. delegate_task sizes each summary against the parent's
|
||||
# remaining context headroom (split across the batch); when it must
|
||||
# trim, the full text is spilled to ~/.hermes/cache/delegation/
|
||||
# (mounted into remote backends) and the in-context summary becomes a
|
||||
# head+tail window plus a footer with the exact read_file offset to
|
||||
# page the omitted middle — the same convention web_extract uses for
|
||||
# large pages. Nothing is lost. max_summary_chars is a hard per-summary
|
||||
# character ceiling layered on top of that dynamic budget
|
||||
# (belt-and-suspenders for models that ignore the "be concise"
|
||||
# instruction). 0 disables the hard ceiling; the dynamic headroom
|
||||
# budget still applies.
|
||||
"max_summary_chars": 24000,
|
||||
|
||||
"child_timeout_seconds": 0, # optional wall-clock cap per child agent. 0 (default)
|
||||
# = no timeout: children fail only from real errors
|
||||
# (API, tools, iteration budget), never a delegation
|
||||
|
|
|
|||
|
|
@ -79,6 +79,7 @@ AUTHOR_MAP = {
|
|||
"15205536+595650661@users.noreply.github.com": "595650661", # PR #37851 salvage (classify MiniMax new_sensitive content filter → content_policy_blocked; #32421)
|
||||
"benbenwyb@gmail.com": "benbenlijie", # PR #47205 salvage (named custom-provider extra_body + Z.AI Coding overload adaptive backoff; #50663)
|
||||
"dana@added-value.co.il": "Danamove", # PR #46726 salvage (kill venv-resident pythonw gateway before recreating venv on Windows; #47036/#47557/#47910)
|
||||
"rcint@klaith.com": "rc-int", # PR #9126 salvage / co-author (cap subagent summary size vs parent context overflow)
|
||||
"145739220+wgu9@users.noreply.github.com": "wgu9", # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325)
|
||||
"165020384+uperLu@users.noreply.github.com": "uperLu", # PR #50958 salvage (rename plugins/cron → plugins/cron_providers; #50872)
|
||||
"277269729+yusekiotacode@users.noreply.github.com": "yusekiotacode", # PR #48706 salvage (anthropic OAuth login token endpoint → platform.claude.com; #45250/#49821)
|
||||
|
|
|
|||
126
tests/tools/test_delegate_summary_budget.py
Normal file
126
tests/tools/test_delegate_summary_budget.py
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
"""Tests for subagent summary budgeting (PR #9126).
|
||||
|
||||
delegate_task caps subagent summaries against the parent's remaining context
|
||||
headroom (split across the batch) before they enter the parent's context, and
|
||||
spills the full text to disk so nothing is lost. This guards the
|
||||
compression/429 death spiral that batch fan-out could trigger by returning N
|
||||
full summaries verbatim into the parent.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
import tools.delegate_tool as dt
|
||||
|
||||
|
||||
class _FakeCompressor:
    """Minimal stand-in for a parent's context compressor.

    Carries only the two attributes the summary-budget code reads via
    ``getattr``: ``context_length`` and ``max_tokens``.
    """

    def __init__(self, context_length, max_tokens):
        self.context_length, self.max_tokens = context_length, max_tokens
|
||||
|
||||
|
||||
class _FakeParent:
    """Minimal stand-in for a parent agent.

    Exposes ``context_compressor`` (a ``_FakeCompressor``) and
    ``session_prompt_tokens``, the attributes the budget helpers inspect.
    """

    def __init__(self, context_length, used_tokens, max_tokens):
        self.session_prompt_tokens = used_tokens
        self.context_compressor = _FakeCompressor(context_length, max_tokens)
|
||||
|
||||
|
||||
def test_small_summaries_pass_through_untouched():
    """Summaries far below the cap are left as-is and never flagged."""
    roomy_parent = _FakeParent(context_length=200_000, used_tokens=10_000, max_tokens=8_000)
    first = {"task_index": 0, "summary": "short result A", "status": "completed"}
    second = {"task_index": 1, "summary": "short result B", "status": "completed"}
    batch = [first, second]

    dt._apply_summary_budget(batch, roomy_parent)

    assert batch[0]["summary"] == "short result A"
    for entry in batch:
        assert "summary_truncated" not in entry
|
||||
|
||||
|
||||
def test_batch_overflow_trimmed_and_spilled_losslessly(monkeypatch):
    """Oversized batch summaries are trimmed in-context and spilled in full."""
    # Point HERMES_HOME at a throwaway directory so spill files stay isolated.
    with tempfile.TemporaryDirectory() as scratch:
        monkeypatch.setenv("HERMES_HOME", os.path.join(scratch, ".hermes"))

        # Unique first and last lines let us check that both ends are kept.
        payload = "\n".join(["HEAD_MARKER", "X" * 50_000, "TAIL_MARKER"])

        # 120k of 131k tokens already used: very little headroom is left.
        crowded_parent = _FakeParent(
            context_length=131_000, used_tokens=120_000, max_tokens=8_000
        )
        batch = [
            {"task_index": idx, "summary": payload, "status": "completed"}
            for idx in range(5)
        ]

        dt._apply_summary_budget(batch, crowded_parent)

        delegation_subdir = os.path.join("cache", "delegation")
        for entry in batch:
            trimmed = entry["summary"]
            assert entry["summary_truncated"] is True
            assert len(trimmed) < len(payload)

            # Both the opening and the closing of the summary stay in-context.
            assert "HEAD_MARKER" in trimmed
            assert "TAIL_MARKER" in trimmed

            spill = entry.get("summary_full_path")
            assert spill and os.path.exists(spill)

            # The spilled file must contain the untrimmed original.
            with open(spill, encoding="utf-8") as fh:
                assert fh.read() == payload

            # The footer tells the parent how to page into the full text.
            assert "read_file" in trimmed
            assert "offset=" in trimmed

            # The file lives under the delegation cache directory.
            assert delegation_subdir in spill
|
||||
|
||||
|
||||
def test_dynamic_budget_shrinks_as_batch_grows():
    """The per-summary budget strictly decreases as the batch gets larger."""
    parent_args = (131_000, 30_000, 8_000)
    caps = [
        dt._parent_summary_char_budget(_FakeParent(*parent_args), batch_size)
        for batch_size in (1, 5, 20)
    ]

    assert all(value is not None for value in caps)
    # Same headroom divided among more children gives each a smaller share.
    single, five, twenty = caps
    assert single > five > twenty
|
||||
|
||||
|
||||
def test_floor_enforced_when_parent_over_budget():
    """With no headroom left, each summary gets exactly the minimum floor."""
    # 200k used tokens against a 131k window: the parent is already over.
    overfull_parent = _FakeParent(131_000, 200_000, 8_000)
    per_summary = dt._parent_summary_char_budget(overfull_parent, 3)
    assert per_summary == dt._MIN_SUMMARY_CHARS
|
||||
|
||||
|
||||
def test_unknown_context_falls_back_to_static_ceiling(monkeypatch):
    """Without a compressor the dynamic budget is None, yet trimming still happens."""

    class _Bare:
        pass

    # A parent with no context_compressor yields no dynamic budget.
    assert dt._parent_summary_char_budget(_Bare(), 3) is None

    # The configured static ceiling is then the only cap, and it still applies.
    with tempfile.TemporaryDirectory() as scratch:
        monkeypatch.setenv("HERMES_HOME", os.path.join(scratch, ".hermes"))
        entry = {"task_index": 0, "summary": "Y" * 40_000, "status": "completed"}
        batch = [entry]
        dt._apply_summary_budget(batch, _Bare())
        assert batch[0]["summary_truncated"] is True
        assert len(batch[0]["summary"]) < 40_000
|
||||
|
||||
|
||||
def test_disabled_static_ceiling_and_unknown_context_leaves_summary_intact(monkeypatch):
    """With neither cap active, a large summary passes through unchanged."""

    class _Bare:
        pass

    def _ceiling_disabled():
        # max_summary_chars == 0 switches the static ceiling off.
        return {"max_summary_chars": 0}

    # Static ceiling disabled and no compressor, so no dynamic budget either.
    monkeypatch.setattr(dt, "_load_config", _ceiling_disabled)
    batch = [{"task_index": 0, "summary": "Z" * 40_000, "status": "completed"}]

    dt._apply_summary_budget(batch, _Bare())

    untouched = batch[0]
    assert "summary_truncated" not in untouched
    assert len(untouched["summary"]) == 40_000
|
||||
|
||||
|
||||
def test_empty_results_is_noop():
    """Empty batches and entries without a summary must not raise."""
    # An empty result list has nothing to budget.
    dt._apply_summary_budget([], _FakeParent(131_000, 1_000, 8_000))

    # A failed child whose summary is None is skipped as well.
    failed_only = [{"task_index": 0, "status": "failed", "summary": None}]
    dt._apply_summary_budget(failed_only, _FakeParent(131_000, 1_000, 8_000))
|
||||
|
|
@ -350,6 +350,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [
|
|||
("cache/videos", "video_cache"),
|
||||
("cache/screenshots", "browser_screenshots"),
|
||||
("cache/web", "web_cache"),
|
||||
("cache/delegation", "delegation_cache"),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -601,6 +601,18 @@ def _preserve_parent_mcp_toolsets(
|
|||
|
||||
|
||||
DEFAULT_MAX_ITERATIONS = 50
|
||||
# Hard per-summary character ceiling layered on top of the dynamic
|
||||
# headroom budget (see _apply_summary_budget). Belt-and-suspenders for
|
||||
# models that ignore the "be concise" instruction. 0 disables the ceiling.
|
||||
DEFAULT_MAX_SUMMARY_CHARS = 24000
|
||||
# Fraction of the parent's *remaining* context headroom that the whole batch
|
||||
# of subagent summaries is allowed to consume. The per-summary budget is this
|
||||
# slice divided across the batch, so N children can't collectively blow the
|
||||
# parent's window (the compression/429 death-spiral in issue/PR #9126).
|
||||
_SUMMARY_HEADROOM_FRACTION = 0.5
|
||||
# Floor so a single summary always gets a usable slice even when the parent is
|
||||
# already nearly full — below this we'd be truncating to noise.
|
||||
_MIN_SUMMARY_CHARS = 2000
|
||||
# No default wall-clock cap on child agents: legitimate heavy subagent work
|
||||
# (deep reviews, research fan-outs, slow reasoning models) was being killed
|
||||
# mid-task. Errors should come from what the child actually does; stuck-child
|
||||
|
|
@ -702,8 +714,10 @@ def _build_child_system_prompt(
|
|||
"- Any issues encountered\n\n"
|
||||
"Important workspace rule: Never assume a repository lives at /workspace/... or any other container-style path unless the task/context explicitly gives that path. "
|
||||
"If no exact local path is provided, discover it first before issuing git/workdir-specific commands.\n\n"
|
||||
"Be thorough but concise -- your response is returned to the "
|
||||
"parent agent as a summary."
|
||||
"Keep your final summary tight: lead with outcomes, prefer bullet "
|
||||
"points over paragraphs, and don't replay your whole process. Your "
|
||||
"response is returned to the parent agent as a summary, and overlong "
|
||||
"summaries crowd out the parent's context window."
|
||||
)
|
||||
if role == "orchestrator":
|
||||
child_note = (
|
||||
|
|
@ -1509,6 +1523,181 @@ def _dump_subagent_timeout_diagnostic(
|
|||
return None
|
||||
|
||||
|
||||
def _spill_summary_to_file(task_index: int, summary: str) -> Optional[str]:
    """Persist a subagent's complete summary under the delegation cache.

    The file is written as UTF-8 to the directory resolved by
    ``get_hermes_dir("cache/delegation", "delegation_cache")`` and named
    ``subagent-summary-<task_index>-<timestamp>.txt``, where the timestamp
    has microsecond resolution. The design intent (same convention as
    web_extract's ``_store_full_text``) is that this cache directory is
    mounted into remote backends via ``credential_files._CACHE_DIRS`` so the
    parent can ``read_file`` the full text from any backend.

    Args:
        task_index: Index of the subagent task; embedded in the file name.
        summary: Full, untrimmed summary text to store.

    Returns:
        The path of the written file as a string, or ``None`` if anything
        failed. Storage is best-effort: failures are logged at debug level
        and never raised.
    """
    try:
        import datetime as _dt

        from hermes_constants import get_hermes_dir

        spill_dir = get_hermes_dir("cache/delegation", "delegation_cache")
        spill_dir.mkdir(parents=True, exist_ok=True)
        stamp = _dt.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        target = spill_dir / f"subagent-summary-{task_index}-{stamp}.txt"
        target.write_text(summary, encoding="utf-8")
    except Exception as exc:
        logger.debug("Failed to spill subagent summary to file: %s", exc)
        return None
    return str(target)
|
||||
|
||||
|
||||
def _trim_summary_with_footer(
    summary: str, cap: int, task_index: int
) -> tuple[str, Optional[str]]:
    """Return ``(model_text, spill_path)`` for one over-budget summary.

    Follows the same convention as web_extract's ``_truncate_with_footer``:
    keep a head+tail window (~75% head / ~25% tail) so the subagent's opening
    AND its closing (outcomes / files-changed / issues, which live at the
    end) both survive, spill the full text to disk, and append a footer that
    states how much is shown and the ``read_file offset=`` at which the
    omitted middle begins. Deterministic apart from the spill file name.

    The head cut is snapped back to the last newline when that newline lies
    in the second half of the head slice; the tail cut is snapped forward to
    the first newline when it lies in the first half of the tail slice.

    Args:
        summary: The subagent's full summary text.
        cap: Combined character budget for head + tail. The separator and
            footer are added on top of this budget.
        task_index: Index of the subagent task, used for the spill file name.

    Returns:
        A tuple of the trimmed in-context text and the spill file path
        (``None`` if the spill failed). A summary that already fits within
        ``cap`` is returned unchanged with a ``None`` path and no spill.
    """
    original_len = len(summary)
    # A negative cap would turn the slices below into "drop N chars" slices.
    cap = max(0, cap)
    if original_len <= cap:
        # Nothing to trim; windowing would only duplicate overlapping text.
        return summary, None

    head_budget = int(cap * 0.75)
    tail_budget = cap - head_budget

    head = summary[:head_budget]
    # ``summary[-0:]`` is the whole string, so a zero tail budget needs a guard.
    tail = summary[-tail_budget:] if tail_budget > 0 else ""
    # Snap the head cut back to the last newline so we don't slice mid-line.
    nl = head.rfind("\n")
    if nl > head_budget * 0.5:
        head = head[:nl]
    # Snap the tail cut forward to the next newline for the same reason.
    nl = tail.find("\n")
    if 0 <= nl < tail_budget * 0.5:
        tail = tail[nl + 1:]

    spill_path = _spill_summary_to_file(task_index, summary)

    footer_lines = [
        "",
        "─" * 8 + " [SUMMARY TRUNCATED] " + "─" * 8,
        f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) "
        f"of {original_len:,} total — trimmed to protect the parent's context window.",
    ]
    if spill_path:
        # read_file is 1-indexed. The head holds ``count("\n") + 1`` lines,
        # the last of which may be partial. If the head stops exactly at a
        # line end, the omitted middle starts on the NEXT line (+2). If it
        # was cut mid-line, the rest of that same line is omitted text, so
        # the offset must point at it (+1) rather than skip past it.
        head_ends_on_line_boundary = summary[len(head):len(head) + 1] == "\n"
        middle_start_line = head.count("\n") + (2 if head_ends_on_line_boundary else 1)
        footer_lines.append(f"Full subagent output saved to: {spill_path}")
        footer_lines.append(
            f'To read the omitted middle: read_file path="{spill_path}" '
            f"offset={middle_start_line} limit=200 (the file is the complete "
            f"summary; raise/lower offset to page through it)."
        )
    else:
        footer_lines.append(
            "Full output could not be stored to disk; the head+tail above is "
            "all that was preserved."
        )
    footer_lines.append("─" * 37)

    model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail + "\n".join(footer_lines)
    return model_text, spill_path
|
||||
|
||||
|
||||
def _parent_summary_char_budget(parent_agent, n_summaries: int) -> Optional[int]:
    """Compute how many characters each summary in a batch may occupy.

    The budget is derived from the parent's *remaining* input headroom:
    ``context_length`` minus the tokens already in its prompt
    (``session_prompt_tokens``) minus the compressor's ``max_tokens`` output
    reserve. ``_SUMMARY_HEADROOM_FRACTION`` of that headroom is shared
    equally across ``n_summaries`` and converted to characters at an
    estimated 4 chars per token. The result never drops below
    ``_MIN_SUMMARY_CHARS``.

    Args:
        parent_agent: Object expected to expose ``context_compressor`` and
            ``session_prompt_tokens``; missing attributes are tolerated.
        n_summaries: Number of summaries sharing the headroom (values below
            1 are treated as 1).

    Returns:
        The per-summary character budget, or ``None`` when the parent's
        context length is unavailable or the computation fails. The caller
        then relies on the static ceiling alone.
    """
    try:
        ctx = getattr(parent_agent, "context_compressor", None)
        window = getattr(ctx, "context_length", None)
        if not (isinstance(window, int) and window > 0):
            return None

        prompt_tokens = getattr(parent_agent, "session_prompt_tokens", 0)
        if not isinstance(prompt_tokens, (int, float)) or prompt_tokens < 0:
            prompt_tokens = 0

        # Subtract the output reservation so only INPUT headroom is measured.
        output_reserve = getattr(ctx, "max_tokens", 0) or 0
        free_tokens = window - int(prompt_tokens) - int(output_reserve)
        if free_tokens <= 0:
            # No headroom at all: hand out just the minimum usable slice.
            return _MIN_SUMMARY_CHARS

        shared_tokens = int(free_tokens * _SUMMARY_HEADROOM_FRACTION)
        tokens_each = shared_tokens // max(1, n_summaries)
        chars_each = tokens_each * 4  # ~4 chars/token
        return max(_MIN_SUMMARY_CHARS, chars_each)
    except Exception:
        logger.debug("Summary budget computation failed", exc_info=True)
        return None
|
||||
|
||||
|
||||
def _apply_summary_budget(results: List[Dict[str, Any]], parent_agent) -> None:
    """Cap subagent summaries in place so a batch cannot flood the parent.

    Only dict entries with a non-empty string ``"summary"`` are considered.
    The per-summary cap is the smallest active limit among:
      - the dynamic headroom budget from ``_parent_summary_char_budget``
        (remaining parent context divided across the batch), and
      - the static ``max_summary_chars`` config ceiling (0 = disabled;
        an unparsable value falls back to ``DEFAULT_MAX_SUMMARY_CHARS``).

    If neither limit is active, nothing is modified. Each summary longer
    than the cap is replaced with the trimmed text returned by
    ``_trim_summary_with_footer`` (head+tail window plus footer), flagged
    with ``summary_truncated = True`` and, when the full text was stored on
    disk, annotated with ``summary_full_path``.

    This addresses issue/PR #9126: batch fan-out returned N full summaries
    verbatim, blowing the parent context and (on rate-limited providers)
    triggering a compression/429 death spiral.

    Args:
        results: Per-task result dicts; mutated in place.
        parent_agent: The delegating agent whose context headroom is measured.
    """
    trimmable = []
    for item in results:
        if not isinstance(item, dict):
            continue
        text = item.get("summary")
        if isinstance(text, str) and text:
            trimmable.append(item)
    if not trimmable:
        return

    cfg = _load_config()
    try:
        static_ceiling = int(cfg.get("max_summary_chars", DEFAULT_MAX_SUMMARY_CHARS))
    except (TypeError, ValueError):
        static_ceiling = DEFAULT_MAX_SUMMARY_CHARS

    dynamic_budget = _parent_summary_char_budget(parent_agent, len(trimmable))

    # Keep only limits that are present and positive; either may be off.
    active_limits = [
        limit for limit in (static_ceiling, dynamic_budget) if limit and limit > 0
    ]
    if not active_limits:
        return  # both disabled / unknown → leave summaries untouched
    cap = min(active_limits)

    for item in trimmable:
        full_text = item["summary"]
        full_len = len(full_text)
        if full_len <= cap:
            continue
        trimmed_text, spill_path = _trim_summary_with_footer(
            full_text, cap, item.get("task_index", -1)
        )
        item["summary"] = trimmed_text
        item["summary_truncated"] = True
        if spill_path:
            item["summary_full_path"] = spill_path
        logger.debug(
            "[subagent-%s] summary trimmed %d → ~%d chars (spill=%s)",
            item.get("task_index", "?"),
            full_len,
            cap,
            spill_path or "none",
        )
|
||||
|
||||
|
||||
def _run_single_child(
|
||||
task_index: int,
|
||||
goal: str,
|
||||
|
|
@ -2437,6 +2626,12 @@ def delegate_task(
|
|||
# Sort by task_index so results match input order
|
||||
results.sort(key=lambda r: r["task_index"])
|
||||
|
||||
# Cap subagent summaries against the parent's remaining context
|
||||
# headroom (split across the batch) before they enter the parent's
|
||||
# conversation. Full text is spilled to disk so nothing is lost.
|
||||
# Covers both the single-task and batch paths. See PR #9126.
|
||||
_apply_summary_budget(results, parent_agent)
|
||||
|
||||
# Notify parent's memory provider of delegation outcomes
|
||||
if (
|
||||
parent_agent
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue