fix(delegation): remove the default subagent wall-clock timeout (#45149)

Subagents doing legitimate heavy work (deep code reviews, research fan-outs, slow reasoning models) were routinely killed at the blanket 600s child_timeout_seconds cap while making steady progress (e.g. 36 API calls completed when the axe fell). Failures should come from what the child is actually doing — API errors, tool errors, iteration budget — not a delegation-level stopwatch.

- DEFAULT_CHILD_TIMEOUT: 600 -> None; Future.result(timeout=None) blocks until the child finishes
- config default delegation.child_timeout_seconds: 600 -> 0 (0/negative = disabled; positive opts back in, floor 30s unchanged)
- stuck-child protection unchanged: the heartbeat staleness monitor still stops refreshing parent activity so the gateway inactivity timeout fires on a truly wedged worker; the 0-API-call diagnostic dump still works when a cap is configured
- docs updated (EN + zh-Hans)
2026-06-14 09:11:54 +00:00 · 2026-06-12 12:58:25 -07:00 · 2026-06-12 12:58:25 -07:00 · bba9b519aa
commit bba9b519aa
parent 9b01c4d193
4 changed files with 59 additions and 24 deletions
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -1760,10 +1760,11 @@ DEFAULT_CONFIG = {
        "inherit_mcp_toolsets": True,
        "max_iterations": 50,  # per-subagent iteration cap (each subagent gets its own budget,
                               # independent of the parent's max_iterations)
-        "child_timeout_seconds": 600,  # wall-clock timeout for each child agent (floor 30s,
-                                       # no ceiling). High-reasoning models on large tasks
-                                       # (e.g. gpt-5.5 xhigh, opus-4.6) need generous budgets;
-                                       # raise if children time out before producing output.
+        "child_timeout_seconds": 0,  # optional wall-clock cap per child agent. 0 (default)
+                                     # = no timeout: children fail only from real errors
+                                     # (API, tools, iteration budget), never a delegation
+                                     # stopwatch. Set a positive number of seconds
+                                     # (floor 30s) to enforce a hard cap.
        "reasoning_effort": "",  # reasoning effort for subagents: "xhigh", "high", "medium",
                                 # "low", "minimal", "none" (empty = inherit parent's level)
        "max_concurrent_children": 3,  # max parallel children per batch; floor of 1 enforced, no ceiling
--- a/tools/delegate_tool.py
+++ b/tools/delegate_tool.py
@ -397,31 +397,46 @@ def _get_max_concurrent_children() -> int:
    return _DEFAULT_MAX_CONCURRENT_CHILDREN


-def _get_child_timeout() -> float:
+def _get_child_timeout() -> Optional[float]:
    """Read delegation.child_timeout_seconds from config.

    Returns the number of seconds a single child agent is allowed to run
-    before being considered stuck.  Default: 600 s (10 minutes).
+    before being cut off, or ``None`` when no wall-clock cap applies.
+
+    Default: ``None`` (no timeout). Subagents doing legitimate heavy work
+    (deep code review, large research fan-outs, slow reasoning models) were
+    routinely killed mid-task by the old blanket cap even though they were
+    making steady progress. Failures should come from what the child is
+    actually doing — API errors, tool errors, iteration budget — not from a
+    generic delegation-level stopwatch. Stuck-child protection is handled
+    separately by the heartbeat staleness monitor, which stops refreshing
+    parent activity so the gateway inactivity timeout can fire.
+
+    Set ``delegation.child_timeout_seconds`` to a positive number to opt back
+    in to a hard cap (floor 30 s); ``0`` or a negative value means disabled.
    """
    cfg = _load_config()
    val = cfg.get("child_timeout_seconds")
    if val is not None:
        try:
-            return max(30.0, float(val))
+            parsed = float(val)
        except (TypeError, ValueError):
            logger.warning(
                "delegation.child_timeout_seconds=%r is not a valid number; "
-                "using default %d",
+                "using default (no timeout)",
                val,
-                DEFAULT_CHILD_TIMEOUT,
            )
+        else:
+            return None if parsed <= 0 else max(30.0, parsed)
    env_val = os.getenv("DELEGATION_CHILD_TIMEOUT_SECONDS")
    if env_val:
        try:
-            return max(30.0, float(env_val))
+            parsed = float(env_val)
        except (TypeError, ValueError):
            pass
-    return float(DEFAULT_CHILD_TIMEOUT)
+        else:
+            return None if parsed <= 0 else max(30.0, parsed)
+    return DEFAULT_CHILD_TIMEOUT


 def _get_max_spawn_depth() -> int:
@ -544,7 +559,12 @@ def _preserve_parent_mcp_toolsets(


 DEFAULT_MAX_ITERATIONS = 50
-DEFAULT_CHILD_TIMEOUT = 600  # seconds before a child agent is considered stuck
+# No default wall-clock cap on child agents: legitimate heavy subagent work
+# (deep reviews, research fan-outs, slow reasoning models) was being killed
+# mid-task. Errors should come from what the child actually does; stuck-child
+# detection lives in the heartbeat staleness monitor below. Users can opt back
+# in via delegation.child_timeout_seconds.
+DEFAULT_CHILD_TIMEOUT: Optional[float] = None
 _HEARTBEAT_INTERVAL = 30  # seconds between parent activity heartbeats during delegation
 # Stale-heartbeat thresholds. A child with no API-call progress is either:
 #   - idle between turns (no current_tool) — probably stuck on a slow API call
@ -552,7 +572,8 @@ _HEARTBEAT_INTERVAL = 30  # seconds between parent activity heartbeats during de
 #     operation (terminal command, web fetch, large file read)
 # The idle ceiling stays tight so genuinely stuck children don't mask the gateway
 # timeout. The in-tool ceiling is much higher so legit long-running tools get
-# time to finish; child_timeout_seconds (default 600s) is still the hard cap.
+# time to finish; delegation.child_timeout_seconds (off by default) remains an
+# optional hard cap for users who want one.
 _HEARTBEAT_STALE_CYCLES_IDLE = 15  # 15 * 30s = 450s idle between turns → stale
 _HEARTBEAT_STALE_CYCLES_IN_TOOL = 40  # 40 * 30s = 1200s stuck on same tool → stale
 DEFAULT_TOOLSETS = ["terminal", "file", "web"]
@ -1556,8 +1577,9 @@ def _run_single_child(
            list(file_state.known_reads(parent_task_id)) if parent_task_id else []
        )

-        # Run child with a hard timeout to prevent indefinite blocking
-        # when the child's API call or tool-level HTTP request hangs.
+        # Run child with an optional hard timeout (off by default —
+        # result(timeout=None) blocks until the child finishes). Stuck-child
+        # protection comes from the heartbeat staleness monitor instead.
        child_timeout = _get_child_timeout()
        _timeout_executor = ThreadPoolExecutor(
            max_workers=1,
@ -1615,7 +1637,9 @@ def _run_single_child(
                diagnostic_path = _dump_subagent_timeout_diagnostic(
                    child=child,
                    task_index=task_index,
-                    timeout_seconds=float(child_timeout),
+                    # is_timeout implies a cap was configured (result(timeout=None)
+                    # never raises FuturesTimeoutError); guard for the type checker.
+                    timeout_seconds=float(child_timeout or 0.0),
                    duration_seconds=float(duration),
                    worker_thread=_worker_thread_holder.get("t"),
                    goal=goal,
--- a/website/docs/user-guide/features/delegation.md
+++ b/website/docs/user-guide/features/delegation.md
@ -175,17 +175,22 @@ delegate_task(

 ## Child Timeout

-Subagents are killed as stuck if they go quiet for more than `delegation.child_timeout_seconds` wall-clock seconds. The default is **600** (10 minutes) — bumped up from 300s in earlier releases because high-reasoning models on non-trivial research tasks were getting killed mid-think. Tune it per-install:
+By default there is **no wall-clock timeout** on subagents. Children fail only from what they're actually doing — API errors, tool errors, or hitting their iteration budget — never from a delegation-level stopwatch. Earlier releases shipped a hard cap (300s, later 600s), which kept killing legitimately busy children mid-task: deep code reviews, large research fan-outs, and slow reasoning models routinely need more than 10 minutes while making steady progress the whole time.
+
+Genuinely stuck children are still detected: the heartbeat staleness monitor stops refreshing the parent's activity when a child makes no progress (no API calls, no tool starts), letting the gateway inactivity timeout fire on a truly wedged worker.
+
+If you want a hard cap anyway (e.g. cost control on unattended cron-driven delegation), opt in per-install:

 ```yaml
 delegation:
-  child_timeout_seconds: 600   # default
+  child_timeout_seconds: 0     # default: 0 = no timeout
+  # child_timeout_seconds: 1800  # opt-in hard cap (floor 30s)
 ```

-Lower it for fast local models; raise it for slow reasoning models on hard problems. The timer resets every time the child makes an API call or tool call — only genuinely idle workers trigger the kill.
+A positive value enforces a hard wall-clock limit on each child (values below 30 are raised to the 30-second floor); `0` or a negative value disables it.

 :::tip Diagnostic dump on zero-call timeout
-If a subagent times out having made **zero** API calls (usually: provider unreachable, auth failure, or tool-schema rejection), `delegate_task` writes a structured diagnostic to `~/.hermes/logs/subagent-timeout-<session>-<timestamp>.log` containing the subagent's config snapshot, credential-resolution trace, and any early error messages. Much easier to root-cause than the previous silent-timeout behavior.
+With a hard cap configured, if a subagent times out having made **zero** API calls (usually: provider unreachable, auth failure, or tool-schema rejection), `delegate_task` writes a structured diagnostic to `~/.hermes/logs/subagent-timeout-<session>-<timestamp>.log` containing the subagent's config snapshot, credential-resolution trace, and any early error messages. Much easier to root-cause than the previous silent-timeout behavior.
 :::

 ## Monitoring Running Subagents (`/agents`)
--- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/delegation.md
+++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/delegation.md
@ -175,17 +175,22 @@ delegate_task(

 ## 子智能体超时

-如果子智能体静默超过 `delegation.child_timeout_seconds` 秒（挂钟时间），则会被判定为卡死并终止。默认值为 **600**（10 分钟）——相比早期版本的 300 秒有所提升，因为高推理能力模型在处理非平凡研究任务时会在推理中途被终止。可按安装实例调整：
+默认情况下，子智能体**没有挂钟超时限制**。子智能体只会因其实际执行的操作而失败——API 错误、工具错误或达到迭代预算上限——而不会被委派层面的计时器终止。早期版本曾设有硬性上限（300 秒，后为 600 秒），但这会在任务执行过程中误杀正常工作的子智能体：深度代码审查、大规模研究分发以及慢速推理模型经常需要超过 10 分钟，而它们全程都在稳定推进。
+
+真正卡死的子智能体仍会被检测到：当子智能体没有任何进展（无 API 调用、无工具启动）时，心跳陈旧度监控会停止刷新父智能体的活动状态，从而让网关的不活动超时机制对真正卡死的工作进程生效。
+
+如果仍需要硬性上限（例如对无人值守的 cron 驱动委派进行成本控制），可按安装实例选择启用：

 ```yaml
 delegation:
-  child_timeout_seconds: 600   # default
+  child_timeout_seconds: 0     # 默认：0 = 无超时
+  # child_timeout_seconds: 1800  # 选择启用的硬性上限（下限 30 秒）
 ```

-对于快速本地模型可降低此值；对于处理难题的慢速推理模型可提高此值。计时器在子智能体每次发起 API 调用或工具调用时重置——只有真正空闲的工作线程才会触发终止。
+正值会对每个子智能体强制执行挂钟时间硬限制（低于 30 的值会被提升至 30 秒下限）；`0` 或负值表示禁用。

 :::tip 零调用超时时的诊断转储
-如果子智能体在**零次** API 调用的情况下超时（通常原因：provider 不可达、认证失败或工具 schema 被拒绝），`delegate_task` 会将结构化诊断信息写入 `~/.hermes/logs/subagent-timeout-<session>-<timestamp>.log`，其中包含子智能体的配置快照、凭据解析追踪以及早期错误消息。比之前的静默超时行为更易于定位根因。
+在配置了硬性上限的情况下，如果子智能体在**零次** API 调用的情况下超时（通常原因：provider 不可达、认证失败或工具 schema 被拒绝），`delegate_task` 会将结构化诊断信息写入 `~/.hermes/logs/subagent-timeout-<session>-<timestamp>.log`，其中包含子智能体的配置快照、凭据解析追踪以及早期错误消息。比之前的静默超时行为更易于定位根因。
 :::

 ## 监控运行中的子智能体（`/agents`）