mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(install): time-box desktop + node-deps installs so a stalled download self-heals (#39219)
The desktop install step ran `npm ci` / `npm run pack` with no wall-clock cap, and the sibling browser-tools / TUI / agent-browser dependency installs had the same gap. The Electron binary (~150MB) is fetched from GitHub during the pack; on a throttled or region-blocked link that download can *stall* rather than fail — npm never errors and never exits, so the installer sits on "Build desktop app" (step 9/11) indefinitely with only harmless 'npm warn deprecated' lines visible. The existing self-heal escalation (cache purge -> dist restore -> npmmirror fallback) only fires when pack returns non-zero, so a stall bypassed it.

- run_with_timeout (generalized from run_browser_install_with_timeout): uses GNU `timeout --foreground -k 10` (Ctrl+C-aware, #35166) or `gtimeout` for external commands; otherwise it falls back to a pure-shell process-group watchdog, so stock macOS (where neither binary is present) is protected. Shell functions (_desktop_pack) always take the pure-shell path — the timeout binary can't exec a function. The budget is normalized to an integer, and a boundary recheck ensures a command finishing in the final poll second isn't mislabeled 124. The internal wait is guarded so `set -e` can't abort mid-function before the real exit code is computed.
- Wrap the desktop npm ci/install (sharing ONE budget via a computed deadline so a stall can't cost 2x DESKTOP_BUILD_TIMEOUT) plus all three _desktop_pack attempts (DESKTOP_BUILD_TIMEOUT, default 900s), and the browser-tools / TUI / agent-browser registry installs (NODE_DEPS_TIMEOUT, default 600s). A stall now converts to a bounded non-zero exit that feeds the existing mirror self-heal instead of hanging the whole install.
This commit is contained in:
parent
c1c179a239
commit
546193aa6d
1 changed files with 134 additions and 25 deletions
|
|
@ -1886,26 +1886,98 @@ strip_snap_browser_override() {
|
|||
}
|
||||
|
||||
# Backward-compatible alias kept for existing callers: forwards every argument
# (timeout in seconds, then the command and its arguments) unchanged to
# run_with_timeout and returns its exit status.
run_browser_install_with_timeout() {
    run_with_timeout "$@"
}
|
||||
|
||||
# Run a command with a hard wall-clock timeout, returning non-zero if it is
# killed.
#
# Usage:   run_with_timeout SECONDS COMMAND [ARG...]
#
# Arguments:
#   $1   - timeout as a bare integer number of seconds. Anything else (empty,
#          suffixed like "15m", negative) is replaced by 900 so that both
#          execution paths share one contract.
#   $2.. - the command to run: an external program or a shell function.
#
# Returns:
#   The command's own exit status when it finishes in time. On timeout the
#   pure-shell watchdog returns 124 (matching GNU `timeout`); on the
#   binary path the status is whatever `timeout`/`gtimeout` reports.
#
# Strategy:
#   * External command + `timeout` (Linux) or `gtimeout` (macOS via Homebrew)
#     available -> delegate to that binary.
#   * Shell function target (the binary cannot exec a function), or neither
#     binary present (stock macOS) -> pure-shell watchdog: launch the command
#     in its own process group, poll once per second, and SIGTERM (then
#     SIGKILL) the whole group when the budget is exhausted.
#
# The pure-shell path is what protects the bug-#39219 case — a stalled Electron
# download on macOS, where `timeout` is usually absent — turning an indefinite
# hang into a non-zero exit so callers (install_desktop) can self-heal via the
# mirror fallback.
#
# NOTE(review): on the pure-shell path the command runs in a background process
# group, so a terminal Ctrl+C is presumably delivered to the installer shell
# rather than to the command — confirm whether an INT trap that kills the
# group is needed there (same concern as #35166).
run_with_timeout() {
    local timeout_seconds="$1"
    shift

    # Normalize to a bare integer; fall back to the desktop default if a caller
    # ever passes a suffixed/empty value (the pure-shell loop needs an int).
    case "$timeout_seconds" in
        ''|*[!0-9]*) timeout_seconds=900 ;;
    esac

    local rc=0

    # The `timeout` binary can only exec an external command, not a shell
    # function. Use it only when the target is NOT a function; functions always
    # go through the pure-shell watchdog (which runs them in a subshell of the
    # current shell and sees them directly — no fragile env export needed).
    if [ "$(type -t "$1" 2>/dev/null)" != "function" ]; then
        local timeout_bin=""
        if command -v timeout >/dev/null 2>&1; then
            timeout_bin="timeout"
        elif command -v gtimeout >/dev/null 2>&1; then
            timeout_bin="gtimeout"
        fi

        if [ -n "$timeout_bin" ]; then
            # GNU `timeout` runs the command in its own process group, so a
            # terminal Ctrl+C is delivered to `timeout` but never reaches the
            # child — the download looks frozen and ignores Ctrl+C (#35166).
            # `--foreground` keeps the command in the shell's foreground group
            # so Ctrl+C reaches it; `-k 10` sends SIGKILL 10s after the deadline
            # so a wedged download can't outlive the timeout. Both flags are
            # GNU-only — probe for them (on each call) and fall back to plain
            # `timeout` on BusyBox (Alpine).
            #
            # `|| rc=$?` mirrors the guard used on the pure-shell path: the
            # command's non-zero status is returned to the caller instead of
            # letting `set -e` abort here when run_with_timeout is called
            # outside an if/|| context.
            if "$timeout_bin" --foreground -k 10 1 true >/dev/null 2>&1; then
                "$timeout_bin" --foreground -k 10 "$timeout_seconds" "$@" || rc=$?
            else
                "$timeout_bin" "$timeout_seconds" "$@" || rc=$?
            fi
            return "$rc"
        fi
        # Neither binary exists: drop to the pure-shell watchdog below.
    fi

    # Pure-shell fallback: run in a new process group so we can kill the whole
    # subtree (npm spawns node + the Electron downloader as children). Job
    # control is enabled only around the launch, and is switched back off only
    # if it was off when we were called, so the caller's shell options are
    # left exactly as we found them.
    local monitor_was_on=false
    case "$-" in
        *m*) monitor_was_on=true ;;
    esac
    set -m
    ( "$@" ) &
    local cmd_pid=$!
    if [ "$monitor_was_on" = false ]; then
        set +m
    fi

    # Poll once per second. The liveness check runs BEFORE the budget check on
    # every iteration (including the last one), so a command that finishes
    # during the final poll interval is reaped with its real status rather than
    # being killed and mislabeled as 124.
    local waited=0
    while :; do
        if ! kill -0 "$cmd_pid" 2>/dev/null; then
            # `|| rc=$?` keeps the non-zero child status without letting
            # `set -e` abort the caller here.
            rc=0; wait "$cmd_pid" 2>/dev/null || rc=$?
            return "$rc"
        fi
        if [ "$waited" -ge "$timeout_seconds" ]; then
            break
        fi
        sleep 1
        waited=$((waited + 1))
    done

    # Timed out: kill the process group (negative PID), escalate to KILL. Each
    # kill falls back to the bare PID in case the group signal is rejected.
    kill -TERM "-$cmd_pid" 2>/dev/null || kill -TERM "$cmd_pid" 2>/dev/null || true
    sleep 2
    kill -KILL "-$cmd_pid" 2>/dev/null || kill -KILL "$cmd_pid" 2>/dev/null || true
    wait "$cmd_pid" 2>/dev/null || true
    return 124
}
|
||||
|
||||
# Return success only when the host is an apt release NEWER than the newest one
|
||||
|
|
@ -2048,8 +2120,10 @@ install_node_deps() {
|
|||
if [ -f "$INSTALL_DIR/package.json" ]; then
|
||||
log_info "Installing Node.js dependencies (browser tools)..."
|
||||
cd "$INSTALL_DIR"
|
||||
npm install --silent 2>/dev/null || {
|
||||
log_warn "npm install failed (browser tools may not work)"
|
||||
# Time-boxed: a stalled registry fetch would otherwise hang here with no
|
||||
# progress (same #39219 stall class as the desktop build below).
|
||||
run_with_timeout "$NODE_DEPS_TIMEOUT" npm install --silent || {
|
||||
log_warn "npm install failed or timed out (browser tools may not work)"
|
||||
}
|
||||
log_success "Node.js dependencies installed"
|
||||
|
||||
|
|
@ -2149,8 +2223,9 @@ install_node_deps() {
|
|||
if [ -f "$INSTALL_DIR/ui-tui/package.json" ]; then
|
||||
log_info "Installing TUI dependencies..."
|
||||
cd "$INSTALL_DIR/ui-tui"
|
||||
npm install --silent 2>/dev/null || {
|
||||
log_warn "TUI npm install failed (hermes --tui may not work)"
|
||||
# Time-boxed: a stalled registry fetch would otherwise hang here (#39219).
|
||||
run_with_timeout "$NODE_DEPS_TIMEOUT" npm install --silent || {
|
||||
log_warn "TUI npm install failed or timed out (hermes --tui may not work)"
|
||||
}
|
||||
log_success "TUI dependencies installed"
|
||||
fi
|
||||
|
|
@ -2392,11 +2467,13 @@ ensure_browser() {
|
|||
log_info "Installing agent-browser..."
|
||||
local log_file
|
||||
log_file="$(mktemp)"
|
||||
if ! "$npm_bin" install -g --prefix "$HERMES_HOME/node" --silent --ignore-scripts \
|
||||
# Time-boxed (#39219): a stalled npm registry fetch here would otherwise
|
||||
# hang the installer with no progress, same class as the desktop build.
|
||||
if ! run_with_timeout "$NODE_DEPS_TIMEOUT" "$npm_bin" install -g --prefix "$HERMES_HOME/node" --silent --ignore-scripts \
|
||||
"agent-browser@^0.26.0" \
|
||||
"@askjo/camofox-browser@^1.5.2" \
|
||||
>"$log_file" 2>&1; then
|
||||
log_error "npm install failed:"
|
||||
log_error "npm install failed or timed out:"
|
||||
cat "$log_file" >&2
|
||||
rm -f "$log_file"
|
||||
return 1
|
||||
|
|
@ -2581,6 +2658,18 @@ _desktop_pack() {
|
|||
# Last-resort Electron mirror after GitHub download fails (#47266).
DESKTOP_ELECTRON_FALLBACK_MIRROR="https://npmmirror.com/mirrors/electron/"

# Validate a user-overridable timeout and print the value to use on stdout.
#
# Arguments:
#   $1 - variable name (used only in the warning message)
#   $2 - candidate value
#   $3 - fallback number of seconds
#
# The candidate is accepted only when it is a positive bare integer; it is
# printed in canonical base-10 form (leading zeros stripped, so later `$(( ))`
# arithmetic cannot misread e.g. 0900 as invalid octal). Anything else —
# suffixed values such as "15m", zero, non-numeric text — prints the fallback
# and emits a warning on stderr. An empty candidate falls back silently.
_normalize_timeout_seconds() {
    local var_name="$1" candidate="$2" fallback="$3"

    case "$candidate" in
        '')
            printf '%s\n' "$fallback"
            return 0
            ;;
        *[!0-9]*)
            printf 'warning: %s=%s is not a whole number of seconds; using %s\n' \
                "$var_name" "$candidate" "$fallback" >&2
            printf '%s\n' "$fallback"
            return 0
            ;;
    esac

    # Force base 10 so leading zeros are not interpreted as octal.
    candidate=$((10#$candidate))
    if [ "$candidate" -le 0 ]; then
        printf 'warning: %s must be greater than zero; using %s\n' \
            "$var_name" "$fallback" >&2
        printf '%s\n' "$fallback"
        return 0
    fi
    printf '%s\n' "$candidate"
}

# Per-attempt wall-clock cap for the desktop npm install / electron-builder pack
# (#39219). A stalled (not failed) Electron download on a throttled/blocked link
# never returns, so without this the installer hangs forever on "Build desktop
# app". 900s is generous enough for a slow-but-progressing ~150MB fetch + build;
# override with DESKTOP_BUILD_TIMEOUT for very slow links. The value is used in
# shell arithmetic by install_desktop, so it is normalized to an integer here.
DESKTOP_BUILD_TIMEOUT="$(_normalize_timeout_seconds DESKTOP_BUILD_TIMEOUT "${DESKTOP_BUILD_TIMEOUT:-900}" 900)"

# Wall-clock cap for the plain registry `npm install`s (browser-tools + TUI
# deps, and the global agent-browser install). Same #39219 stall class but no
# ~150MB Electron binary, so a shorter default; override with NODE_DEPS_TIMEOUT
# for very slow links. Normalized here so an invalid override falls back to
# this 600s default rather than to the desktop default.
NODE_DEPS_TIMEOUT="$(_normalize_timeout_seconds NODE_DEPS_TIMEOUT "${NODE_DEPS_TIMEOUT:-600}" 600)"
|
||||
|
||||
# Electron package dir — workspace-local nest first, then root hoist.
|
||||
_electron_dir() {
|
||||
local install_dir="$1"
|
||||
|
|
@ -2678,8 +2767,28 @@ install_desktop() {
|
|||
# flake) — leaving tsc/typescript unresolved and `npm run pack`'s
|
||||
# `tsc -b` failing with no obvious cause. Fall back to `npm install`
|
||||
# only if `npm ci` is unavailable or the lockfile is out of sync.
|
||||
#
|
||||
# Both the install and the build below are wrapped in a hard wall-clock
|
||||
# timeout (#39219): the Electron binary (~150MB) is fetched from GitHub,
|
||||
# and on a throttled/blocked connection that download can *stall* — npm
|
||||
# neither errors nor exits, so the installer sits on "Build desktop app"
|
||||
# forever with only `npm warn deprecated` lines visible. A stall now
|
||||
# converts to a non-zero exit, which feeds the existing self-heal /
|
||||
# mirror-fallback escalation instead of hanging the whole install.
|
||||
#
|
||||
# The `npm ci` and its `npm install` fallback SHARE one budget: a stalled
|
||||
# link wedges both identically, so giving each a full DESKTOP_BUILD_TIMEOUT
|
||||
# would double the worst-case hang. We compute a single deadline and pass
|
||||
# the remaining seconds to the fallback (min 30s so it still gets a real
|
||||
# attempt if `npm ci` failed fast rather than stalling).
|
||||
log_info "Installing desktop workspace dependencies (includes Electron ~150MB, 1-3min)..."
|
||||
if ( cd "$INSTALL_DIR" && npm ci ) || ( cd "$INSTALL_DIR" && npm install ); then
|
||||
local _deps_start _deps_remaining
|
||||
_deps_start=$(date +%s)
|
||||
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" bash -c 'cd "$1" && npm ci' _ "$INSTALL_DIR"; then
|
||||
log_success "Desktop workspace dependencies installed"
|
||||
elif _deps_remaining=$(( DESKTOP_BUILD_TIMEOUT - ($(date +%s) - _deps_start) )); \
|
||||
[ "$_deps_remaining" -lt 30 ] && _deps_remaining=30; \
|
||||
run_with_timeout "$_deps_remaining" bash -c 'cd "$1" && npm install' _ "$INSTALL_DIR"; then
|
||||
log_success "Desktop workspace dependencies installed"
|
||||
elif _electron_pkg_staged_missing_dist "$INSTALL_DIR"; then
|
||||
log_warn "Desktop dependency install failed with a missing Electron dist; attempting self-heal..."
|
||||
|
|
@ -2708,7 +2817,7 @@ install_desktop() {
|
|||
# the GitHub-blocked/throttled case (the repeating "retrying" log).
|
||||
log_info "Building desktop app (this takes 1-3 minutes)..."
|
||||
local pack_ok=false
|
||||
if _desktop_pack "$desktop_dir"; then
|
||||
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" _desktop_pack "$desktop_dir"; then
|
||||
pack_ok=true
|
||||
else
|
||||
local purged=""
|
||||
|
|
@ -2719,7 +2828,7 @@ install_desktop() {
|
|||
fi
|
||||
if [ "$restored" = true ]; then
|
||||
log_warn "Desktop build failed; refreshed the Electron download and retrying once..."
|
||||
if _desktop_pack "$desktop_dir"; then
|
||||
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" _desktop_pack "$desktop_dir"; then
|
||||
pack_ok=true
|
||||
fi
|
||||
fi
|
||||
|
|
@ -2731,7 +2840,7 @@ install_desktop() {
|
|||
log_warn "Re-downloading Electron via a public mirror ($DESKTOP_ELECTRON_FALLBACK_MIRROR), then rebuilding..."
|
||||
log_warn " (set ELECTRON_MIRROR yourself to use a different/trusted mirror)"
|
||||
_electron_dist_ok "$INSTALL_DIR" || _restore_electron_dist "$INSTALL_DIR" "$DESKTOP_ELECTRON_FALLBACK_MIRROR" || true
|
||||
if _desktop_pack "$desktop_dir" "$DESKTOP_ELECTRON_FALLBACK_MIRROR"; then
|
||||
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" _desktop_pack "$desktop_dir" "$DESKTOP_ELECTRON_FALLBACK_MIRROR"; then
|
||||
pack_ok=true
|
||||
fi
|
||||
fi
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue