fix(install): time-box desktop + node-deps installs so a stalled download self-heals (#39219)

The desktop install step ran npm ci / npm run pack with no wall-clock cap, and
the sibling browser-tools / TUI / agent-browser dependency installs had the same
gap. The Electron binary (~150MB) is fetched from GitHub during the pack; on a
throttled or region-blocked link that download can *stall* rather than fail —
npm never errors and never exits, so the installer sits on "Build desktop app"
(step 9/11) indefinitely with only harmless 'npm warn deprecated' lines visible.
The existing self-heal escalation (cache purge -> dist restore -> npmmirror
fallback) only fires when pack returns non-zero, so a stall bypassed it.

- run_with_timeout (generalized from run_browser_install_with_timeout): GNU
  timeout --foreground -k 10 (Ctrl+C-aware, #35166) / gtimeout for external
  commands, else a pure-shell process-group watchdog so stock macOS (neither
  binary present) is protected. Shell functions (_desktop_pack) always take the
  pure-shell path — the timeout binary can't exec a function. Integer-normalized
  budget + a boundary recheck so a command finishing in the final poll second
  isn't mislabeled 124. The internal wait is guarded so set -e can't abort
  mid-function before the real exit code is computed.
- Wrap the desktop npm ci/install (sharing ONE budget via a computed deadline so
  a stall can't cost 2x DESKTOP_BUILD_TIMEOUT) + all three _desktop_pack attempts
  (DESKTOP_BUILD_TIMEOUT, default 900s), and the browser-tools / TUI / agent-
  browser registry installs (NODE_DEPS_TIMEOUT, default 600s).

A stall now converts to a bounded non-zero exit that feeds the existing mirror
self-heal instead of hanging the whole install.
This commit is contained in:
kshitijk4poor 2026-06-28 14:09:18 +05:30 committed by Teknium
parent c1c179a239
commit 546193aa6d

View file

@ -1886,26 +1886,98 @@ strip_snap_browser_override() {
}
# Thin forwarding wrapper: passes every argument through unchanged to
# run_with_timeout and returns that call's exit status.
# Arguments: $1 - timeout in seconds; $2.. - the command to run and its args.
# NOTE(review): presumably kept so callers that still use the older,
# browser-specific name keep working — confirm before removing.
run_browser_install_with_timeout() {
run_with_timeout "$@"
}
# Run a command with a hard wall-clock timeout, returning non-zero if it is
# killed. Prefers GNU coreutils `timeout` (Linux) or `gtimeout` (macOS via
# Homebrew) for an external-command target; otherwise (and always for a shell
# function target, which the `timeout` binary cannot exec) it uses a pure-shell
# watchdog: launch the command in its own process group, poll until it finishes,
# and SIGTERM (then SIGKILL) the whole group on timeout. The pure-shell path is
# what protects the bug-#39219 case — a stalled Electron download on macOS,
# where `timeout` is usually absent — turning an indefinite hang into a non-zero
# exit so callers (install_desktop) can self-heal via the mirror fallback.
#
# $1 (timeout) must be a bare integer number of seconds — the pure-shell loop
# compares it arithmetically (the `timeout` binary would also accept suffixes
# like 15m, but we normalize so both paths share one contract). On timeout the
# return code is 124, matching GNU `timeout`.
#
# Arguments: $1   - timeout in whole seconds (non-integer/empty -> 900)
#            $2.. - command (external program or shell function) and its args
# Returns:   the command's own exit status, or 124 when the deadline expired.
run_with_timeout() {
    local timeout_seconds="$1"
    shift

    # Normalize to a bare integer; fall back to the desktop default if a caller
    # ever passes a suffixed/empty value (the pure-shell loop needs an int).
    case "$timeout_seconds" in
        ''|*[!0-9]*) timeout_seconds=900 ;;
    esac

    # The `timeout` binary can only exec an external command, not a shell
    # function. Use it only when the target is NOT a function; functions always
    # go through the pure-shell watchdog (which runs them in a subshell of the
    # current shell and sees them directly — no fragile env export needed).
    if [ "$(type -t "$1" 2>/dev/null)" != "function" ]; then
        local timeout_bin=""
        if command -v timeout >/dev/null 2>&1; then
            timeout_bin="timeout"
        elif command -v gtimeout >/dev/null 2>&1; then
            timeout_bin="gtimeout"
        fi
        if [ -n "$timeout_bin" ]; then
            # GNU `timeout` runs the command in its own process group, so a
            # terminal Ctrl+C is delivered to `timeout` but never reaches the
            # child — the download looks frozen and ignores Ctrl+C (#35166).
            # `--foreground` keeps the command in the shell's foreground group
            # so Ctrl+C reaches it; `-k 10` sends SIGKILL 10s after the deadline
            # so a wedged download can't outlive the timeout. Both flags are
            # GNU-only — probe once and fall back to plain `timeout` on BusyBox
            # (Alpine). When neither binary exists (stock macOS) we drop to the
            # pure-shell watchdog below.
            if "$timeout_bin" --foreground -k 10 1 true >/dev/null 2>&1; then
                "$timeout_bin" --foreground -k 10 "$timeout_seconds" "$@"
            else
                "$timeout_bin" "$timeout_seconds" "$@"
            fi
            return $?
        fi
    fi

    # Pure-shell fallback: run in a new process group so we can kill the whole
    # subtree (npm spawns node + the Electron downloader as children).
    # Monitor mode is only needed while the job is launched; remember whether
    # the caller already had it enabled so we restore that state instead of
    # unconditionally switching job control off behind the caller's back.
    # NOTE(review): because the job gets its own process group, a terminal
    # Ctrl+C presumably does not reach it on this path — confirm whether an
    # INT/TERM trap that kills the group is wanted here.
    local monitor_was_on=false
    case "$-" in
        *m*) monitor_was_on=true ;;
    esac
    set -m
    ( "$@" ) &
    local cmd_pid=$!
    [ "$monitor_was_on" = true ] || set +m

    local waited=0
    local rc
    while [ "$waited" -lt "$timeout_seconds" ]; do
        if ! kill -0 "$cmd_pid" 2>/dev/null; then
            # `|| rc=$?` keeps the non-zero child status without letting
            # `set -e` abort the caller here (this would fire if
            # run_with_timeout were ever called outside an if/|| context).
            rc=0; wait "$cmd_pid" 2>/dev/null || rc=$?
            return "$rc"
        fi
        sleep 1
        waited=$((waited + 1))
    done

    # Final boundary recheck: the command may have finished during the last
    # poll interval — don't kill (and mislabel as 124) a process that already
    # exited cleanly in the last second of the budget.
    if ! kill -0 "$cmd_pid" 2>/dev/null; then
        rc=0; wait "$cmd_pid" 2>/dev/null || rc=$?
        return "$rc"
    fi

    # Timed out: kill the process group (negative PID), escalate to KILL.
    # Each kill falls back to the bare PID in case the group signal fails.
    kill -TERM "-$cmd_pid" 2>/dev/null || kill -TERM "$cmd_pid" 2>/dev/null || true
    sleep 2
    kill -KILL "-$cmd_pid" 2>/dev/null || kill -KILL "$cmd_pid" 2>/dev/null || true
    # Reap the child so no zombie is left; its status is irrelevant now.
    wait "$cmd_pid" 2>/dev/null || true
    return 124
}
# Return success only when the host is an apt release NEWER than the newest one
@ -2048,8 +2120,10 @@ install_node_deps() {
if [ -f "$INSTALL_DIR/package.json" ]; then
log_info "Installing Node.js dependencies (browser tools)..."
cd "$INSTALL_DIR"
npm install --silent 2>/dev/null || {
log_warn "npm install failed (browser tools may not work)"
# Time-boxed: a stalled registry fetch would otherwise hang here with no
# progress (same #39219 stall class as the desktop build below).
run_with_timeout "$NODE_DEPS_TIMEOUT" npm install --silent || {
log_warn "npm install failed or timed out (browser tools may not work)"
}
log_success "Node.js dependencies installed"
@ -2149,8 +2223,9 @@ install_node_deps() {
if [ -f "$INSTALL_DIR/ui-tui/package.json" ]; then
log_info "Installing TUI dependencies..."
cd "$INSTALL_DIR/ui-tui"
npm install --silent 2>/dev/null || {
log_warn "TUI npm install failed (hermes --tui may not work)"
# Time-boxed: a stalled registry fetch would otherwise hang here (#39219).
run_with_timeout "$NODE_DEPS_TIMEOUT" npm install --silent || {
log_warn "TUI npm install failed or timed out (hermes --tui may not work)"
}
log_success "TUI dependencies installed"
fi
@ -2392,11 +2467,13 @@ ensure_browser() {
log_info "Installing agent-browser..."
local log_file
log_file="$(mktemp)"
if ! "$npm_bin" install -g --prefix "$HERMES_HOME/node" --silent --ignore-scripts \
# Time-boxed (#39219): a stalled npm registry fetch here would otherwise
# hang the installer with no progress, same class as the desktop build.
if ! run_with_timeout "$NODE_DEPS_TIMEOUT" "$npm_bin" install -g --prefix "$HERMES_HOME/node" --silent --ignore-scripts \
"agent-browser@^0.26.0" \
"@askjo/camofox-browser@^1.5.2" \
>"$log_file" 2>&1; then
log_error "npm install failed:"
log_error "npm install failed or timed out:"
cat "$log_file" >&2
rm -f "$log_file"
return 1
@ -2581,6 +2658,18 @@ _desktop_pack() {
# Last-resort Electron mirror after GitHub download fails (#47266).
DESKTOP_ELECTRON_FALLBACK_MIRROR="https://npmmirror.com/mirrors/electron/"

# Print $1 as a base-10 whole number of seconds when it consists only of
# digits (leading zeros are stripped so later $(( )) arithmetic never parses it
# as octal); otherwise warn on stderr and print the fallback $2.
# Arguments: $1 - candidate value; $2 - integer default to use when $1 is bad.
# Outputs:   the normalized number of seconds on stdout.
_timeout_seconds_or_default() {
    case "$1" in
        ''|*[!0-9]*)
            printf 'Warning: ignoring non-integer timeout "%s"; using %ss\n' "$1" "$2" >&2
            printf '%s\n' "$2"
            ;;
        *)
            printf '%s\n' "$((10#$1))"
            ;;
    esac
}

# Per-attempt wall-clock cap for the desktop npm install / electron-builder pack
# (#39219). A stalled (not failed) Electron download on a throttled/blocked link
# never returns, so without this the installer hangs forever on "Build desktop
# app". 900s is generous enough for a slow-but-progressing ~150MB fetch + build;
# override with DESKTOP_BUILD_TIMEOUT for very slow links.
# Normalized to a bare integer here because install_desktop subtracts elapsed
# seconds from it with $(( )), which errors out on values such as "15m".
DESKTOP_BUILD_TIMEOUT="$(_timeout_seconds_or_default "${DESKTOP_BUILD_TIMEOUT:-900}" 900)"

# Wall-clock cap for the plain registry `npm install`s (browser-tools, TUI and
# agent-browser deps). Same #39219 stall class but no ~150MB Electron binary,
# so a shorter default; override with NODE_DEPS_TIMEOUT for very slow links.
# Normalized here so a malformed override falls back to this 600s default
# rather than to run_with_timeout's generic 900s fallback.
NODE_DEPS_TIMEOUT="$(_timeout_seconds_or_default "${NODE_DEPS_TIMEOUT:-600}" 600)"
# Electron package dir — workspace-local nest first, then root hoist.
_electron_dir() {
local install_dir="$1"
@ -2678,8 +2767,28 @@ install_desktop() {
# flake) — leaving tsc/typescript unresolved and `npm run pack`'s
# `tsc -b` failing with no obvious cause. Fall back to `npm install`
# only if `npm ci` is unavailable or the lockfile is out of sync.
#
# Both the install and the build below are wrapped in a hard wall-clock
# timeout (#39219): the Electron binary (~150MB) is fetched from GitHub,
# and on a throttled/blocked connection that download can *stall* — npm
# neither errors nor exits, so the installer sits on "Build desktop app"
# forever with only `npm warn deprecated` lines visible. A stall now
# converts to a non-zero exit, which feeds the existing self-heal /
# mirror-fallback escalation instead of hanging the whole install.
#
# The `npm ci` and its `npm install` fallback SHARE one budget: a stalled
# link wedges both identically, so giving each a full DESKTOP_BUILD_TIMEOUT
# would double the worst-case hang. We compute a single deadline and pass
# the remaining seconds to the fallback (min 30s so it still gets a real
# attempt if `npm ci` failed fast rather than stalling).
log_info "Installing desktop workspace dependencies (includes Electron ~150MB, 1-3min)..."
if ( cd "$INSTALL_DIR" && npm ci ) || ( cd "$INSTALL_DIR" && npm install ); then
local _deps_start _deps_remaining
_deps_start=$(date +%s)
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" bash -c 'cd "$1" && npm ci' _ "$INSTALL_DIR"; then
log_success "Desktop workspace dependencies installed"
elif _deps_remaining=$(( DESKTOP_BUILD_TIMEOUT - ($(date +%s) - _deps_start) )); \
[ "$_deps_remaining" -lt 30 ] && _deps_remaining=30; \
run_with_timeout "$_deps_remaining" bash -c 'cd "$1" && npm install' _ "$INSTALL_DIR"; then
log_success "Desktop workspace dependencies installed"
elif _electron_pkg_staged_missing_dist "$INSTALL_DIR"; then
log_warn "Desktop dependency install failed with a missing Electron dist; attempting self-heal..."
@ -2708,7 +2817,7 @@ install_desktop() {
# the GitHub-blocked/throttled case (the repeating "retrying" log).
log_info "Building desktop app (this takes 1-3 minutes)..."
local pack_ok=false
if _desktop_pack "$desktop_dir"; then
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" _desktop_pack "$desktop_dir"; then
pack_ok=true
else
local purged=""
@ -2719,7 +2828,7 @@ install_desktop() {
fi
if [ "$restored" = true ]; then
log_warn "Desktop build failed; refreshed the Electron download and retrying once..."
if _desktop_pack "$desktop_dir"; then
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" _desktop_pack "$desktop_dir"; then
pack_ok=true
fi
fi
@ -2731,7 +2840,7 @@ install_desktop() {
log_warn "Re-downloading Electron via a public mirror ($DESKTOP_ELECTRON_FALLBACK_MIRROR), then rebuilding..."
log_warn " (set ELECTRON_MIRROR yourself to use a different/trusted mirror)"
_electron_dist_ok "$INSTALL_DIR" || _restore_electron_dist "$INSTALL_DIR" "$DESKTOP_ELECTRON_FALLBACK_MIRROR" || true
if _desktop_pack "$desktop_dir" "$DESKTOP_ELECTRON_FALLBACK_MIRROR"; then
if run_with_timeout "$DESKTOP_BUILD_TIMEOUT" _desktop_pack "$desktop_dir" "$DESKTOP_ELECTRON_FALLBACK_MIRROR"; then
pack_ok=true
fi
fi