fix(delegate): resolve merge conflict with upstream thread capture

This commit is contained in:
MorAlekss 2026-04-25 03:13:07 -07:00
commit fda1325e79
522 changed files with 82059 additions and 4541 deletions

View file

@ -319,7 +319,7 @@ def _get_child_timeout() -> float:
"""Read delegation.child_timeout_seconds from config.
Returns the number of seconds a single child agent is allowed to run
before being considered stuck. Default: 300 s (5 minutes).
before being considered stuck. Default: 600 s (10 minutes).
"""
cfg = _load_config()
val = cfg.get("child_timeout_seconds")
@ -430,11 +430,17 @@ def _preserve_parent_mcp_toolsets(
DEFAULT_MAX_ITERATIONS = 50
DEFAULT_CHILD_TIMEOUT = 300 # seconds before a child agent is considered stuck
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
_HEARTBEAT_STALE_CYCLES = (
5 # mark child stale after this many heartbeats with no iteration progress
)
# Stale-heartbeat thresholds. A child with no API-call progress is either:
# - idle between turns (no current_tool) — probably stuck on a slow API call
# - inside a tool (current_tool set) — probably running a legitimately long
# operation (terminal command, web fetch, large file read)
# The idle ceiling stays tight so genuinely stuck children don't mask the gateway
# timeout. The in-tool ceiling is much higher so legit long-running tools get
# time to finish; child_timeout_seconds (default 600s) is still the hard cap.
_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale
_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale
DEFAULT_TOOLSETS = ["terminal", "file", "web"]
@ -1037,6 +1043,150 @@ def _build_child_agent(
return child
def _dump_subagent_timeout_diagnostic(
    *,
    child: Any,
    task_index: int,
    timeout_seconds: float,
    duration_seconds: float,
    worker_thread: Optional[threading.Thread],
    goal: str,
) -> Optional[str]:
    """Write a structured diagnostic dump for a subagent that timed out
    before making any API call.

    See issue #14726: users hit "subagent timed out after 300s with no response"
    with zero API calls and no way to inspect what happened. This helper
    writes a dedicated log under ``~/.hermes/logs/subagent-<sid>-<ts>.log``
    capturing the child's config, system-prompt / tool-schema sizes, activity
    tracker snapshot, and the worker thread's Python stack at timeout.

    Args:
        child: The timed-out child agent object (attributes read defensively).
        task_index: Index of the task within the delegation batch.
        timeout_seconds: The configured child timeout that fired.
        duration_seconds: How long the child actually ran before timing out.
        worker_thread: Thread running the child, if captured; used to dump
            its live Python stack via ``sys._current_frames``.
        goal: The goal text the child was given (truncated in the dump).

    Returns:
        The absolute path to the diagnostic file, or None on any failure
        (this is best-effort diagnostics; it must never raise to the caller).
    """
    try:
        from hermes_constants import get_hermes_home
        import datetime as _dt
        import sys as _sys
        import traceback as _traceback

        hermes_home = get_hermes_home()
        logs_dir = hermes_home / "logs"
        try:
            logs_dir.mkdir(parents=True, exist_ok=True)
        except Exception:
            # Can't create the logs dir — nothing useful to do.
            return None

        subagent_id = getattr(child, "_subagent_id", None) or f"idx{task_index}"
        ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
        dump_path = logs_dir / f"subagent-timeout-{subagent_id}-{ts}.log"

        lines: List[str] = []

        def _w(line: str = "") -> None:
            # Accumulate output lines; written in one shot at the end.
            lines.append(line)

        _w("# Subagent timeout diagnostic — issue #14726")
        _w(f"# Generated: {_dt.datetime.now().isoformat()}")
        _w("")
        _w("## Timeout")
        _w(f" task_index: {task_index}")
        _w(f" subagent_id: {subagent_id}")
        _w(f" configured_timeout: {timeout_seconds}s")
        _w(f" actual_duration: {duration_seconds:.2f}s")
        _w("")
        _w("## Goal")
        _goal_preview = (goal or "").strip()
        if len(_goal_preview) > 1000:
            _goal_preview = _goal_preview[:1000] + " ...[truncated]"
        _w(_goal_preview or "(empty)")
        _w("")
        _w("## Child config")
        for attr in (
            "model", "provider", "api_mode", "base_url", "max_iterations",
            "quiet_mode", "skip_memory", "skip_context_files", "platform",
            "_delegate_role", "_delegate_depth",
        ):
            try:
                val = getattr(child, attr, None)
                # Bug fix: the previous "redaction" branch was a no-op
                # (`if ...: pass`). Actually redact credentials that may be
                # embedded in a base_url (e.g. https://user:secret@host/...)
                # before writing to a log file on disk.
                if attr == "base_url" and isinstance(val, str) and "@" in val:
                    scheme, sep, rest = val.partition("://")
                    if sep and "@" in rest:
                        rest = "<redacted>@" + rest.rsplit("@", 1)[1]
                        val = scheme + sep + rest
                _w(f" {attr}: {val!r}")
            except Exception:
                _w(f" {attr}: <unreadable>")
        _w("")
        _w("## Toolsets")
        enabled = getattr(child, "enabled_toolsets", None)
        _w(f" enabled_toolsets: {enabled!r}")
        tool_names = getattr(child, "valid_tool_names", None)
        if tool_names:
            _w(f" loaded tool count: {len(tool_names)}")
            try:
                _w(f" loaded tools: {sorted(tool_names)}")
            except Exception:
                # Unsortable/heterogeneous names — count above is enough.
                pass
        _w("")
        _w("## Prompt / schema sizes")
        try:
            sys_prompt = (
                getattr(child, "ephemeral_system_prompt", None)
                or getattr(child, "system_prompt", None)
                or ""
            )
            _w(f" system_prompt_bytes: {len(sys_prompt.encode('utf-8')) if isinstance(sys_prompt, str) else 'n/a'}")
            _w(f" system_prompt_chars: {len(sys_prompt) if isinstance(sys_prompt, str) else 'n/a'}")
        except Exception as exc:
            _w(f" system_prompt: <error: {exc}>")
        try:
            tools_schema = getattr(child, "tools", None)
            if tools_schema is not None:
                # default=str so non-JSON-serializable schema entries don't
                # abort the size measurement.
                _schema_json = json.dumps(tools_schema, default=str)
                _w(f" tool_schema_count: {len(tools_schema)}")
                _w(f" tool_schema_bytes: {len(_schema_json.encode('utf-8'))}")
        except Exception as exc:
            _w(f" tool_schema: <error: {exc}>")
        _w("")
        _w("## Activity summary")
        try:
            summary = child.get_activity_summary()
            for k, v in summary.items():
                _w(f" {k}: {v!r}")
        except Exception as exc:
            _w(f" <get_activity_summary failed: {exc}>")
        _w("")
        _w("## Worker thread stack at timeout")
        if worker_thread is not None and worker_thread.is_alive():
            # Snapshot the live worker's Python stack — this is the key
            # signal for 0-API-call hangs (shows exactly where it's stuck).
            frames = _sys._current_frames()
            worker_frame = frames.get(worker_thread.ident)
            if worker_frame is not None:
                stack = _traceback.format_stack(worker_frame)
                for frame_line in stack:
                    for sub in frame_line.rstrip().split("\n"):
                        _w(f" {sub}")
            else:
                _w(" <worker frame not available>")
        elif worker_thread is None:
            _w(" <no worker thread handle>")
        else:
            _w(" <worker thread already exited>")
        _w("")
        _w("## Notes")
        _w(" This file is written ONLY when a subagent times out with 0 API calls.")
        _w(" 0-API-call timeouts mean the child never reached its first LLM request.")
        _w(" Common causes: oversized prompt rejected by provider, transport hang,")
        _w(" credential resolution stuck. See issue #14726 for context.")

        dump_path.write_text("\n".join(lines), encoding="utf-8")
        return str(dump_path)
    except Exception as exc:
        # Diagnostics must never break the timeout-handling path.
        logger.warning("Subagent timeout diagnostic dump failed: %s", exc)
        return None
def _run_single_child(
task_index: int,
goal: str,
@ -1078,7 +1228,11 @@ def _run_single_child(
# Without this, the parent's _last_activity_ts freezes when delegate_task
# starts and the gateway eventually kills the agent for "no activity".
_heartbeat_stop = threading.Event()
_last_seen_iter = [0] # mutable container for heartbeat stale detection
# Stale detection: track the child's (tool, iteration) pair across
# heartbeat cycles. If neither advances, count the cycle as stale.
# Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*).
_last_seen_iter = [0]
_last_seen_tool = [None] # type: list
_stale_count = [0]
def _heartbeat_loop():
@ -1096,22 +1250,38 @@ def _run_single_child(
child_iter = child_summary.get("api_call_count", 0)
child_max = child_summary.get("max_iterations", 0)
# Stale detection: if iteration count hasn't advanced,
# increment stale counter. After N cycles with no
# progress, stop masking the hang so the gateway
# inactivity timeout can fire as a last resort.
if child_iter <= _last_seen_iter[0]:
_stale_count[0] += 1
else:
# Stale detection: count cycles where neither the iteration
# count nor the current_tool advances. A child running a
# legitimately long-running tool (terminal command, web
# fetch) keeps current_tool set but doesn't advance
# api_call_count — we don't want that to look stale at the
# idle threshold.
iter_advanced = child_iter > _last_seen_iter[0]
tool_changed = child_tool != _last_seen_tool[0]
if iter_advanced or tool_changed:
_last_seen_iter[0] = child_iter
_last_seen_tool[0] = child_tool
_stale_count[0] = 0
else:
_stale_count[0] += 1
if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES:
# Pick threshold based on whether the child is currently
# inside a tool call. In-tool threshold is high enough to
# cover legitimately slow tools; idle threshold stays
# tight so the gateway timeout can fire on a truly wedged
# child.
stale_limit = (
_HEARTBEAT_STALE_CYCLES_IN_TOOL
if child_tool
else _HEARTBEAT_STALE_CYCLES_IDLE
)
if _stale_count[0] >= stale_limit:
logger.warning(
"Subagent %d appears stale (no iteration progress "
"for %d heartbeat cycles) — stopping heartbeat",
"Subagent %d appears stale (no progress for %d "
"heartbeat cycles, tool=%s) — stopping heartbeat",
task_index,
_stale_count[0],
child_tool or "<none>",
)
break # stop touching parent, let gateway timeout fire
@ -1193,11 +1363,18 @@ def _run_single_child(
initializer=_set_subagent_approval_cb,
initargs=(_subagent_auto_approve,),
)
_child_future = _timeout_executor.submit(
child.run_conversation,
user_message=goal,
task_id=child_task_id,
)
# Capture the worker thread so the timeout diagnostic can dump its
# Python stack (see #14726 - 0-API-call hangs are opaque without it).
_worker_thread_holder: Dict[str, Optional[threading.Thread]] = {"t": None}
def _run_with_thread_capture():
_worker_thread_holder["t"] = threading.current_thread()
return child.run_conversation(
user_message=goal,
task_id=child_task_id,
)
_child_future = _timeout_executor.submit(_run_with_thread_capture)
try:
result = _child_future.result(timeout=child_timeout)
except Exception as _timeout_exc:
@ -1219,6 +1396,32 @@ def _run_single_child(
duration,
)
# When a subagent times out BEFORE making any API call, dump a
# diagnostic to help users (and us) see what the child was doing.
# See #14726 — without this, 0-API-call hangs are black boxes.
diagnostic_path: Optional[str] = None
child_api_calls = 0
try:
_summary = child.get_activity_summary()
child_api_calls = int(_summary.get("api_call_count", 0) or 0)
except Exception:
pass
if is_timeout and child_api_calls == 0:
diagnostic_path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=task_index,
timeout_seconds=float(child_timeout),
duration_seconds=float(duration),
worker_thread=_worker_thread_holder.get("t"),
goal=goal,
)
if diagnostic_path:
logger.warning(
"Subagent %d 0-API-call timeout — diagnostic written to %s",
task_index,
diagnostic_path,
)
if child_progress_cb:
try:
child_progress_cb(
@ -1235,22 +1438,35 @@ def _run_single_child(
except Exception:
pass
if is_timeout:
if child_api_calls == 0:
_err = (
f"Subagent timed out after {child_timeout}s without "
f"making any API call — the child never reached its "
f"first LLM request (prompt construction, credential "
f"resolution, or transport may be stuck)."
)
if diagnostic_path:
_err += f" Diagnostic: {diagnostic_path}"
else:
_err = (
f"Subagent timed out after {child_timeout}s with "
f"{child_api_calls} API call(s) completed — likely "
f"stuck on a slow API call or unresponsive network request."
)
else:
_err = str(_timeout_exc)
return {
"task_index": task_index,
"status": "timeout" if is_timeout else "error",
"summary": None,
"error": (
(
f"Subagent timed out after {child_timeout}s with no response. "
"The child may be stuck on a slow API call or unresponsive network request."
)
if is_timeout
else str(_timeout_exc)
),
"error": _err,
"exit_reason": "timeout" if is_timeout else "error",
"api_calls": 0,
"api_calls": child_api_calls,
"duration_seconds": duration,
"_child_role": getattr(child, "_delegate_role", None),
"diagnostic_path": diagnostic_path,
}
finally:
# Shut down executor without waiting — if the child thread
@ -1583,7 +1799,18 @@ def delegate_task(
# Load config
cfg = _load_config()
default_max_iter = cfg.get("max_iterations", DEFAULT_MAX_ITERATIONS)
effective_max_iter = max_iterations or default_max_iter
# Model-supplied max_iterations is ignored — the config value is authoritative
# so users get predictable budgets. The kwarg is retained for internal callers
# and tests; a model-emitted value here would only shrink the budget and
# surprise the user mid-run. Log and drop it if one slips through from a
# cached tool schema or a stale provider.
if max_iterations is not None and max_iterations != default_max_iter:
logger.debug(
"delegate_task: ignoring caller-supplied max_iterations=%s; "
"using delegation.max_iterations=%s from config",
max_iterations, default_max_iter,
)
effective_max_iter = default_max_iter
# Resolve delegation credentials (provider:model pair).
# When delegation.provider is configured, this resolves the full credential
@ -2123,13 +2350,6 @@ DELEGATE_TASK_SCHEMA = {
"When provided, top-level goal/context/toolsets are ignored."
),
},
"max_iterations": {
"type": "integer",
"description": (
"Max tool-calling turns per subagent (default: 50). "
"Only set lower for simple tasks."
),
},
"role": {
"type": "string",
"enum": ["leaf", "orchestrator"],