diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 416ac415eb..5e6f75b8f5 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1760,10 +1760,11 @@ DEFAULT_CONFIG = { "inherit_mcp_toolsets": True, "max_iterations": 50, # per-subagent iteration cap (each subagent gets its own budget, # independent of the parent's max_iterations) - "child_timeout_seconds": 600, # wall-clock timeout for each child agent (floor 30s, - # no ceiling). High-reasoning models on large tasks - # (e.g. gpt-5.5 xhigh, opus-4.6) need generous budgets; - # raise if children time out before producing output. + "child_timeout_seconds": 0, # optional wall-clock cap per child agent. 0 (default) + # = no timeout: children fail only from real errors + # (API, tools, iteration budget), never a delegation + # stopwatch. Set a positive number of seconds + # (floor 30s) to enforce a hard cap. "reasoning_effort": "", # reasoning effort for subagents: "xhigh", "high", "medium", # "low", "minimal", "none" (empty = inherit parent's level) "max_concurrent_children": 3, # max parallel children per batch; floor of 1 enforced, no ceiling diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py index 18dd176a13..fb17c537b9 100644 --- a/tools/delegate_tool.py +++ b/tools/delegate_tool.py @@ -397,31 +397,46 @@ def _get_max_concurrent_children() -> int: return _DEFAULT_MAX_CONCURRENT_CHILDREN -def _get_child_timeout() -> float: +def _get_child_timeout() -> Optional[float]: """Read delegation.child_timeout_seconds from config. Returns the number of seconds a single child agent is allowed to run - before being considered stuck. Default: 600 s (10 minutes). + before being cut off, or ``None`` when no wall-clock cap applies. + + Default: ``None`` (no timeout). Subagents doing legitimate heavy work + (deep code review, large research fan-outs, slow reasoning models) were + routinely killed mid-task by the old blanket cap even though they were + making steady progress. 
Failures should come from what the child is + actually doing — API errors, tool errors, iteration budget — not from a + generic delegation-level stopwatch. Stuck-child protection is handled + separately by the heartbeat staleness monitor, which stops refreshing + parent activity so the gateway inactivity timeout can fire. + + Set ``delegation.child_timeout_seconds`` to a positive number to opt back + in to a hard cap (floor 30 s); ``0`` or a negative value means disabled. """ cfg = _load_config() val = cfg.get("child_timeout_seconds") if val is not None: try: - return max(30.0, float(val)) + parsed = float(val) except (TypeError, ValueError): logger.warning( "delegation.child_timeout_seconds=%r is not a valid number; " - "using default %d", + "using default (no timeout)", val, - DEFAULT_CHILD_TIMEOUT, ) + else: + return None if parsed <= 0 else max(30.0, parsed) env_val = os.getenv("DELEGATION_CHILD_TIMEOUT_SECONDS") if env_val: try: - return max(30.0, float(env_val)) + parsed = float(env_val) except (TypeError, ValueError): pass - return float(DEFAULT_CHILD_TIMEOUT) + else: + return None if parsed <= 0 else max(30.0, parsed) + return DEFAULT_CHILD_TIMEOUT def _get_max_spawn_depth() -> int: @@ -544,7 +559,12 @@ def _preserve_parent_mcp_toolsets( DEFAULT_MAX_ITERATIONS = 50 -DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck +# No default wall-clock cap on child agents: legitimate heavy subagent work +# (deep reviews, research fan-outs, slow reasoning models) was being killed +# mid-task. Errors should come from what the child actually does; stuck-child +# detection lives in the heartbeat staleness monitor below. Users can opt back +# in via delegation.child_timeout_seconds. +DEFAULT_CHILD_TIMEOUT: Optional[float] = None _HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation # Stale-heartbeat thresholds. 
A child with no API-call progress is either: # - idle between turns (no current_tool) — probably stuck on a slow API call @@ -552,7 +572,8 @@ _HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during de # operation (terminal command, web fetch, large file read) # The idle ceiling stays tight so genuinely stuck children don't mask the gateway # timeout. The in-tool ceiling is much higher so legit long-running tools get -# time to finish; child_timeout_seconds (default 600s) is still the hard cap. +# time to finish; delegation.child_timeout_seconds (off by default) remains an +# optional hard cap for users who want one. _HEARTBEAT_STALE_CYCLES_IDLE = 15 # 15 * 30s = 450s idle between turns → stale _HEARTBEAT_STALE_CYCLES_IN_TOOL = 40 # 40 * 30s = 1200s stuck on same tool → stale DEFAULT_TOOLSETS = ["terminal", "file", "web"] @@ -1556,8 +1577,9 @@ def _run_single_child( list(file_state.known_reads(parent_task_id)) if parent_task_id else [] ) - # Run child with a hard timeout to prevent indefinite blocking - # when the child's API call or tool-level HTTP request hangs. + # Run child with an optional hard timeout (off by default — + # result(timeout=None) blocks until the child finishes). Stuck-child + # protection comes from the heartbeat staleness monitor instead. child_timeout = _get_child_timeout() _timeout_executor = ThreadPoolExecutor( max_workers=1, @@ -1615,7 +1637,9 @@ def _run_single_child( diagnostic_path = _dump_subagent_timeout_diagnostic( child=child, task_index=task_index, - timeout_seconds=float(child_timeout), + # is_timeout implies a cap was configured (result(timeout=None) + # never raises FuturesTimeoutError); guard for the type checker. 
+ timeout_seconds=float(child_timeout or 0.0), duration_seconds=float(duration), worker_thread=_worker_thread_holder.get("t"), goal=goal, diff --git a/website/docs/user-guide/features/delegation.md b/website/docs/user-guide/features/delegation.md index 1d19c9fddc..b76a1df3d9 100644 --- a/website/docs/user-guide/features/delegation.md +++ b/website/docs/user-guide/features/delegation.md @@ -175,17 +175,22 @@ delegate_task( ## Child Timeout -Subagents are killed as stuck if they go quiet for more than `delegation.child_timeout_seconds` wall-clock seconds. The default is **600** (10 minutes) — bumped up from 300s in earlier releases because high-reasoning models on non-trivial research tasks were getting killed mid-think. Tune it per-install: +By default there is **no wall-clock timeout** on subagents. Children fail only from what they're actually doing — API errors, tool errors, or hitting their iteration budget — never from a delegation-level stopwatch. Earlier releases shipped a hard cap (300s, later 600s), which kept killing legitimately busy children mid-task: deep code reviews, large research fan-outs, and slow reasoning models routinely need more than 10 minutes while making steady progress the whole time. + +Genuinely stuck children are still detected: the heartbeat staleness monitor stops refreshing the parent's activity when a child makes no progress (no API calls, no tool starts), letting the gateway inactivity timeout fire on a truly wedged worker. + +If you want a hard cap anyway (e.g. cost control on unattended cron-driven delegation), opt in per-install: ```yaml delegation: - child_timeout_seconds: 600 # default + child_timeout_seconds: 0 # default: 0 = no timeout + # child_timeout_seconds: 1800 # opt-in hard cap (floor 30s) ``` -Lower it for fast local models; raise it for slow reasoning models on hard problems. The timer resets every time the child makes an API call or tool call — only genuinely idle workers trigger the kill. 
+A positive value enforces a hard wall-clock limit on each child (values below 30 are raised to the 30-second floor); `0` or a negative value disables it. :::tip Diagnostic dump on zero-call timeout -If a subagent times out having made **zero** API calls (usually: provider unreachable, auth failure, or tool-schema rejection), `delegate_task` writes a structured diagnostic to `~/.hermes/logs/subagent-timeout--.log` containing the subagent's config snapshot, credential-resolution trace, and any early error messages. Much easier to root-cause than the previous silent-timeout behavior. +With a hard cap configured, if a subagent times out having made **zero** API calls (usually: provider unreachable, auth failure, or tool-schema rejection), `delegate_task` writes a structured diagnostic to `~/.hermes/logs/subagent-timeout--.log` containing the subagent's config snapshot, credential-resolution trace, and any early error messages. Much easier to root-cause than the previous silent-timeout behavior. ::: ## Monitoring Running Subagents (`/agents`) diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/delegation.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/delegation.md index 9b9af8352d..6458a9ec71 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/delegation.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/delegation.md @@ -175,17 +175,22 @@ delegate_task( ## 子智能体超时 -如果子智能体静默超过 `delegation.child_timeout_seconds` 秒(挂钟时间),则会被判定为卡死并终止。默认值为 **600**(10 分钟)——相比早期版本的 300 秒有所提升,因为高推理能力模型在处理非平凡研究任务时会在推理中途被终止。可按安装实例调整: +默认情况下,子智能体**没有挂钟超时限制**。子智能体只会因其实际执行的操作而失败——API 错误、工具错误或达到迭代预算上限——而不会被委派层面的计时器终止。早期版本曾设有硬性上限(300 秒,后为 600 秒),但这会在任务执行过程中误杀正常工作的子智能体:深度代码审查、大规模研究分发以及慢速推理模型经常需要超过 10 分钟,而它们全程都在稳定推进。 + +真正卡死的子智能体仍会被检测到:当子智能体没有任何进展(无 API 调用、无工具启动)时,心跳陈旧度监控会停止刷新父智能体的活动状态,从而让网关的不活动超时机制对真正卡死的工作进程生效。 + +如果仍需要硬性上限(例如对无人值守的 cron 驱动委派进行成本控制),可按安装实例选择启用: ```yaml delegation: - 
child_timeout_seconds: 600 # default + child_timeout_seconds: 0 # 默认:0 = 无超时 + # child_timeout_seconds: 1800 # 选择启用的硬性上限(下限 30 秒) ``` -对于快速本地模型可降低此值;对于处理难题的慢速推理模型可提高此值。计时器在子智能体每次发起 API 调用或工具调用时重置——只有真正空闲的工作线程才会触发终止。 +正值会对每个子智能体强制执行挂钟时间硬限制(低于 30 的值会被提升至 30 秒下限);`0` 或负值表示禁用。 :::tip 零调用超时时的诊断转储 -如果子智能体在**零次** API 调用的情况下超时(通常原因:provider 不可达、认证失败或工具 schema 被拒绝),`delegate_task` 会将结构化诊断信息写入 `~/.hermes/logs/subagent-timeout--.log`,其中包含子智能体的配置快照、凭据解析追踪以及早期错误消息。比之前的静默超时行为更易于定位根因。 +在配置了硬性上限的情况下,如果子智能体在**零次** API 调用的情况下超时(通常原因:provider 不可达、认证失败或工具 schema 被拒绝),`delegate_task` 会将结构化诊断信息写入 `~/.hermes/logs/subagent-timeout--.log`,其中包含子智能体的配置快照、凭据解析追踪以及早期错误消息。比之前的静默超时行为更易于定位根因。 ::: ## 监控运行中的子智能体(`/agents`)