fix(delegate): resolve merge conflict with upstream thread capture

This commit is contained in:
MorAlekss 2026-04-25 03:13:07 -07:00
commit fda1325e79
522 changed files with 82059 additions and 4541 deletions

View file

@ -319,7 +319,7 @@ def _get_child_timeout() -> float:
"""Read delegation.child_timeout_seconds from config.
Returns the number of seconds a single child agent is allowed to run
before being considered stuck. Default: 300 s (5 minutes).
before being considered stuck. Default: 600 s (10 minutes).
"""
cfg = _load_config()
val = cfg.get("child_timeout_seconds")
@ -430,11 +430,17 @@ def _preserve_parent_mcp_toolsets(
DEFAULT_MAX_ITERATIONS = 50
DEFAULT_CHILD_TIMEOUT = 300 # seconds before a child agent is considered stuck
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
_HEARTBEAT_STALE_CYCLES = (
5 # mark child stale after this many heartbeats with no iteration progress
)
# Stale-heartbeat thresholds. A child with no API-call progress is either:
# - idle between turns (no current_tool) — probably stuck on a slow API call
# - inside a tool (current_tool set) — probably running a legitimately long
# operation (terminal command, web fetch, large file read)
# The idle ceiling stays tight so genuinely stuck children don't mask the gateway
# timeout. The in-tool ceiling is much higher so legit long-running tools get
# time to finish; child_timeout_seconds (default 600s) is still the hard cap.
_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale
_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale
DEFAULT_TOOLSETS = ["terminal", "file", "web"]
@ -1037,6 +1043,150 @@ def _build_child_agent(
return child
def _dump_subagent_timeout_diagnostic(
    *,
    child: Any,
    task_index: int,
    timeout_seconds: float,
    duration_seconds: float,
    worker_thread: Optional[threading.Thread],
    goal: str,
) -> Optional[str]:
    """Write a structured diagnostic dump for a subagent that timed out
    before making any API call.

    See issue #14726: users hit "subagent timed out after 300s with no response"
    with zero API calls and no way to inspect what happened. This helper
    writes a dedicated log under ``~/.hermes/logs/subagent-<sid>-<ts>.log``
    capturing the child's config, system-prompt / tool-schema sizes, activity
    tracker snapshot, and the worker thread's Python stack at timeout.

    Args:
        child: The timed-out child agent object (attributes read defensively).
        task_index: Index of the task within the delegation batch.
        timeout_seconds: The configured child timeout that fired.
        duration_seconds: How long the child actually ran before timing out.
        worker_thread: Thread running the child, if captured; used to dump
            its live Python stack via ``sys._current_frames``.
        goal: The goal text the child was given (truncated in the dump).

    Returns:
        The absolute path to the diagnostic file, or None on any failure
        (this is best-effort diagnostics; it must never raise to the caller).
    """
    try:
        from hermes_constants import get_hermes_home
        import datetime as _dt
        import sys as _sys
        import traceback as _traceback

        hermes_home = get_hermes_home()
        logs_dir = hermes_home / "logs"
        try:
            logs_dir.mkdir(parents=True, exist_ok=True)
        except Exception:
            # Can't create the logs dir — nothing useful to do.
            return None

        subagent_id = getattr(child, "_subagent_id", None) or f"idx{task_index}"
        ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
        dump_path = logs_dir / f"subagent-timeout-{subagent_id}-{ts}.log"

        lines: List[str] = []

        def _w(line: str = "") -> None:
            # Accumulate output lines; written in one shot at the end.
            lines.append(line)

        _w("# Subagent timeout diagnostic — issue #14726")
        _w(f"# Generated: {_dt.datetime.now().isoformat()}")
        _w("")
        _w("## Timeout")
        _w(f" task_index: {task_index}")
        _w(f" subagent_id: {subagent_id}")
        _w(f" configured_timeout: {timeout_seconds}s")
        _w(f" actual_duration: {duration_seconds:.2f}s")
        _w("")
        _w("## Goal")
        _goal_preview = (goal or "").strip()
        if len(_goal_preview) > 1000:
            _goal_preview = _goal_preview[:1000] + " ...[truncated]"
        _w(_goal_preview or "(empty)")
        _w("")
        _w("## Child config")
        for attr in (
            "model", "provider", "api_mode", "base_url", "max_iterations",
            "quiet_mode", "skip_memory", "skip_context_files", "platform",
            "_delegate_role", "_delegate_depth",
        ):
            try:
                val = getattr(child, attr, None)
                # Bug fix: the previous "redaction" branch was a no-op
                # (`if ...: pass`). Actually redact credentials that may be
                # embedded in a base_url (e.g. https://user:secret@host/...)
                # before writing to a log file on disk.
                if attr == "base_url" and isinstance(val, str) and "@" in val:
                    scheme, sep, rest = val.partition("://")
                    if sep and "@" in rest:
                        rest = "<redacted>@" + rest.rsplit("@", 1)[1]
                        val = scheme + sep + rest
                _w(f" {attr}: {val!r}")
            except Exception:
                _w(f" {attr}: <unreadable>")
        _w("")
        _w("## Toolsets")
        enabled = getattr(child, "enabled_toolsets", None)
        _w(f" enabled_toolsets: {enabled!r}")
        tool_names = getattr(child, "valid_tool_names", None)
        if tool_names:
            _w(f" loaded tool count: {len(tool_names)}")
            try:
                _w(f" loaded tools: {sorted(tool_names)}")
            except Exception:
                # Unsortable/heterogeneous names — count above is enough.
                pass
        _w("")
        _w("## Prompt / schema sizes")
        try:
            sys_prompt = (
                getattr(child, "ephemeral_system_prompt", None)
                or getattr(child, "system_prompt", None)
                or ""
            )
            _w(f" system_prompt_bytes: {len(sys_prompt.encode('utf-8')) if isinstance(sys_prompt, str) else 'n/a'}")
            _w(f" system_prompt_chars: {len(sys_prompt) if isinstance(sys_prompt, str) else 'n/a'}")
        except Exception as exc:
            _w(f" system_prompt: <error: {exc}>")
        try:
            tools_schema = getattr(child, "tools", None)
            if tools_schema is not None:
                # default=str so non-JSON-serializable schema entries don't
                # abort the size measurement.
                _schema_json = json.dumps(tools_schema, default=str)
                _w(f" tool_schema_count: {len(tools_schema)}")
                _w(f" tool_schema_bytes: {len(_schema_json.encode('utf-8'))}")
        except Exception as exc:
            _w(f" tool_schema: <error: {exc}>")
        _w("")
        _w("## Activity summary")
        try:
            summary = child.get_activity_summary()
            for k, v in summary.items():
                _w(f" {k}: {v!r}")
        except Exception as exc:
            _w(f" <get_activity_summary failed: {exc}>")
        _w("")
        _w("## Worker thread stack at timeout")
        if worker_thread is not None and worker_thread.is_alive():
            # Snapshot the live worker's Python stack — this is the key
            # signal for 0-API-call hangs (shows exactly where it's stuck).
            frames = _sys._current_frames()
            worker_frame = frames.get(worker_thread.ident)
            if worker_frame is not None:
                stack = _traceback.format_stack(worker_frame)
                for frame_line in stack:
                    for sub in frame_line.rstrip().split("\n"):
                        _w(f" {sub}")
            else:
                _w(" <worker frame not available>")
        elif worker_thread is None:
            _w(" <no worker thread handle>")
        else:
            _w(" <worker thread already exited>")
        _w("")
        _w("## Notes")
        _w(" This file is written ONLY when a subagent times out with 0 API calls.")
        _w(" 0-API-call timeouts mean the child never reached its first LLM request.")
        _w(" Common causes: oversized prompt rejected by provider, transport hang,")
        _w(" credential resolution stuck. See issue #14726 for context.")

        dump_path.write_text("\n".join(lines), encoding="utf-8")
        return str(dump_path)
    except Exception as exc:
        # Diagnostics must never break the timeout-handling path.
        logger.warning("Subagent timeout diagnostic dump failed: %s", exc)
        return None
def _run_single_child(
task_index: int,
goal: str,
@ -1078,7 +1228,11 @@ def _run_single_child(
# Without this, the parent's _last_activity_ts freezes when delegate_task
# starts and the gateway eventually kills the agent for "no activity".
_heartbeat_stop = threading.Event()
_last_seen_iter = [0] # mutable container for heartbeat stale detection
# Stale detection: track the child's (tool, iteration) pair across
# heartbeat cycles. If neither advances, count the cycle as stale.
# Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*).
_last_seen_iter = [0]
_last_seen_tool = [None] # type: list
_stale_count = [0]
def _heartbeat_loop():
@ -1096,22 +1250,38 @@ def _run_single_child(
child_iter = child_summary.get("api_call_count", 0)
child_max = child_summary.get("max_iterations", 0)
# Stale detection: if iteration count hasn't advanced,
# increment stale counter. After N cycles with no
# progress, stop masking the hang so the gateway
# inactivity timeout can fire as a last resort.
if child_iter <= _last_seen_iter[0]:
_stale_count[0] += 1
else:
# Stale detection: count cycles where neither the iteration
# count nor the current_tool advances. A child running a
# legitimately long-running tool (terminal command, web
# fetch) keeps current_tool set but doesn't advance
# api_call_count — we don't want that to look stale at the
# idle threshold.
iter_advanced = child_iter > _last_seen_iter[0]
tool_changed = child_tool != _last_seen_tool[0]
if iter_advanced or tool_changed:
_last_seen_iter[0] = child_iter
_last_seen_tool[0] = child_tool
_stale_count[0] = 0
else:
_stale_count[0] += 1
if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES:
# Pick threshold based on whether the child is currently
# inside a tool call. In-tool threshold is high enough to
# cover legitimately slow tools; idle threshold stays
# tight so the gateway timeout can fire on a truly wedged
# child.
stale_limit = (
_HEARTBEAT_STALE_CYCLES_IN_TOOL
if child_tool
else _HEARTBEAT_STALE_CYCLES_IDLE
)
if _stale_count[0] >= stale_limit:
logger.warning(
"Subagent %d appears stale (no iteration progress "
"for %d heartbeat cycles) — stopping heartbeat",
"Subagent %d appears stale (no progress for %d "
"heartbeat cycles, tool=%s) — stopping heartbeat",
task_index,
_stale_count[0],
child_tool or "<none>",
)
break # stop touching parent, let gateway timeout fire
@ -1193,11 +1363,18 @@ def _run_single_child(
initializer=_set_subagent_approval_cb,
initargs=(_subagent_auto_approve,),
)
_child_future = _timeout_executor.submit(
child.run_conversation,
user_message=goal,
task_id=child_task_id,
)
# Capture the worker thread so the timeout diagnostic can dump its
# Python stack (see #14726 - 0-API-call hangs are opaque without it).
_worker_thread_holder: Dict[str, Optional[threading.Thread]] = {"t": None}
def _run_with_thread_capture():
_worker_thread_holder["t"] = threading.current_thread()
return child.run_conversation(
user_message=goal,
task_id=child_task_id,
)
_child_future = _timeout_executor.submit(_run_with_thread_capture)
try:
result = _child_future.result(timeout=child_timeout)
except Exception as _timeout_exc:
@ -1219,6 +1396,32 @@ def _run_single_child(
duration,
)
# When a subagent times out BEFORE making any API call, dump a
# diagnostic to help users (and us) see what the child was doing.
# See #14726 — without this, 0-API-call hangs are black boxes.
diagnostic_path: Optional[str] = None
child_api_calls = 0
try:
_summary = child.get_activity_summary()
child_api_calls = int(_summary.get("api_call_count", 0) or 0)
except Exception:
pass
if is_timeout and child_api_calls == 0:
diagnostic_path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=task_index,
timeout_seconds=float(child_timeout),
duration_seconds=float(duration),
worker_thread=_worker_thread_holder.get("t"),
goal=goal,
)
if diagnostic_path:
logger.warning(
"Subagent %d 0-API-call timeout — diagnostic written to %s",
task_index,
diagnostic_path,
)
if child_progress_cb:
try:
child_progress_cb(
@ -1235,22 +1438,35 @@ def _run_single_child(
except Exception:
pass
if is_timeout:
if child_api_calls == 0:
_err = (
f"Subagent timed out after {child_timeout}s without "
f"making any API call — the child never reached its "
f"first LLM request (prompt construction, credential "
f"resolution, or transport may be stuck)."
)
if diagnostic_path:
_err += f" Diagnostic: {diagnostic_path}"
else:
_err = (
f"Subagent timed out after {child_timeout}s with "
f"{child_api_calls} API call(s) completed — likely "
f"stuck on a slow API call or unresponsive network request."
)
else:
_err = str(_timeout_exc)
return {
"task_index": task_index,
"status": "timeout" if is_timeout else "error",
"summary": None,
"error": (
(
f"Subagent timed out after {child_timeout}s with no response. "
"The child may be stuck on a slow API call or unresponsive network request."
)
if is_timeout
else str(_timeout_exc)
),
"error": _err,
"exit_reason": "timeout" if is_timeout else "error",
"api_calls": 0,
"api_calls": child_api_calls,
"duration_seconds": duration,
"_child_role": getattr(child, "_delegate_role", None),
"diagnostic_path": diagnostic_path,
}
finally:
# Shut down executor without waiting — if the child thread
@ -1583,7 +1799,18 @@ def delegate_task(
# Load config
cfg = _load_config()
default_max_iter = cfg.get("max_iterations", DEFAULT_MAX_ITERATIONS)
effective_max_iter = max_iterations or default_max_iter
# Model-supplied max_iterations is ignored — the config value is authoritative
# so users get predictable budgets. The kwarg is retained for internal callers
# and tests; a model-emitted value here would only shrink the budget and
# surprise the user mid-run. Log and drop it if one slips through from a
# cached tool schema or a stale provider.
if max_iterations is not None and max_iterations != default_max_iter:
logger.debug(
"delegate_task: ignoring caller-supplied max_iterations=%s; "
"using delegation.max_iterations=%s from config",
max_iterations, default_max_iter,
)
effective_max_iter = default_max_iter
# Resolve delegation credentials (provider:model pair).
# When delegation.provider is configured, this resolves the full credential
@ -2123,13 +2350,6 @@ DELEGATE_TASK_SCHEMA = {
"When provided, top-level goal/context/toolsets are ignored."
),
},
"max_iterations": {
"type": "integer",
"description": (
"Max tool-calling turns per subagent (default: 50). "
"Only set lower for simple tasks."
),
},
"role": {
"type": "string",
"enum": ["leaf", "orchestrator"],