From 432a691758083994a043a5698fa109f81648693d Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 28 May 2026 03:34:47 -0700 Subject: [PATCH] fix(update): stream + idle-kill `npm run build` so a stalled webui-build can't soft-brick the install (#33803) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `hermes update` ran the webui build with `capture_output=True` and no timeout. On low-memory hosts (WSL2's 4 GB default, small VPSes, antivirus stalls) Vite goes silent for minutes; users see a frozen terminal, decide the update is hung, and reboot. The reboot lands *after* `pip install -e .` has already touched the install but *before* the build completes, leaving the `hermes` launcher in place while `hermes_cli` is no longer importable — i.e. `ModuleNotFoundError: No module named 'hermes_cli'` (#33788, same class as #32384). Changes: - New `_run_with_idle_timeout()` helper: streams subprocess output line-by-line (so the user sees Vite progress in real time) and kills the process if no bytes appear on stdout/stderr for 180s. The existing stale-dist fallback (#23817) then serves the previous build instead of failing the update. - `_build_web_ui()` uses the helper for `npm run build` (the actual stall site). `npm install` keeps `subprocess.run` + capture_output to preserve the existing EPERM-retry-on-Windows contract. - Both `cmd_update` call sites print `→ Core update complete. Building dashboard (optional)...` before the webui build. The CLI is fully functional at this point; a webui-build failure only affects `hermes dashboard`. Telegraphing the boundary explicitly stops users from rebooting through the build step. Tests: - `tests/hermes_cli/test_run_with_idle_timeout.py` — 4 tests covering streaming success, nonzero exit, idle-kill, and missing-binary cases. Uses real `subprocess.Popen` on tiny Python scripts; isolated in its own file so per-file canonical-runner parallelism doesn't pair it with the mock-heavy tests. - `tests/hermes_cli/test_web_ui_build.py` — updated existing tests to patch `_run_with_idle_timeout` for the build step in addition to `subprocess.run` for the install step. - `tests/hermes_cli/test_cmd_update.py::test_update_refreshes_repo_and_tui_node_dependencies` — same update. Full suite: `scripts/run_tests.sh tests/hermes_cli/` → 5646 passed, 0 failed. Fixes #33788. --- hermes_cli/main.py | 138 +++++++++++++++--- tests/hermes_cli/test_cmd_update.py | 26 +++- .../hermes_cli/test_run_with_idle_timeout.py | 67 +++++++++ tests/hermes_cli/test_web_ui_build.py | 60 +++++--- 4 files changed, 246 insertions(+), 45 deletions(-) create mode 100644 tests/hermes_cli/test_run_with_idle_timeout.py diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 4d9b37db240..4490d59cbd6 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -6509,6 +6509,104 @@ def _web_ui_build_needed(web_dir: Path) -> bool: return False +def _run_with_idle_timeout( + cmd: list[str], + cwd: Path, + *, + idle_timeout_seconds: int = 180, + indent: str = " ", +) -> subprocess.CompletedProcess: + """Run a subprocess that streams output, with an idle-output timeout. + + Issue #33788: ``npm run build`` (Vite) was invoked with + ``capture_output=True`` and no timeout. On low-memory hosts (notably + WSL2 with the default 4 GB cap) the build can stall or sit silent for + minutes; users see a frozen terminal, assume the update is hung, and + reboot — leaving the editable install in a half-state with the + ``hermes`` launcher present but ``hermes_cli`` not importable. + + This helper fixes both halves: stdout is streamed (so the user sees + progress), and if no bytes have appeared on stdout/stderr for + ``idle_timeout_seconds``, the process is terminated and the call + returns with a non-zero ``returncode``. The caller's existing + stale-dist fallback (#23817) takes over from there. + + Returns a ``CompletedProcess`` with merged stdout (text), empty + stderr, and an integer returncode. Never raises on idle timeout — + propagation of failure is via the returncode. + """ + merged_chunks: list[str] = [] + last_output_ts = _time.monotonic() + lock = threading.Lock() + + try: + proc = subprocess.Popen( + cmd, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding="utf-8", + errors="replace", + bufsize=1, + ) + except OSError as exc: + # E.g. npm not on PATH between the which() check and now. + return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc)) + + def _reader() -> None: + nonlocal last_output_ts + assert proc.stdout is not None + for line in proc.stdout: + try: + print(f"{indent}{line.rstrip()}", flush=True) + except UnicodeEncodeError: + # Windows cp1252 fallback — same pattern as _say(). + enc = getattr(sys.stdout, "encoding", None) or "ascii" + safe = line.rstrip().encode(enc, errors="replace").decode(enc, errors="replace") + print(f"{indent}{safe}", flush=True) + with lock: + merged_chunks.append(line) + last_output_ts = _time.monotonic() + + reader_thread = threading.Thread(target=_reader, daemon=True) + reader_thread.start() + + idle_killed = False + while True: + try: + rc = proc.wait(timeout=5) + break + except subprocess.TimeoutExpired: + with lock: + idle = _time.monotonic() - last_output_ts + if idle > idle_timeout_seconds: + idle_killed = True + proc.terminate() + try: + rc = proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + rc = proc.wait() + break + + # Drain reader so we don't leak the stdout file descriptor. + reader_thread.join(timeout=2) + + combined = "".join(merged_chunks) + if idle_killed: + msg = ( + f"\n ⚠ Build produced no output for {idle_timeout_seconds}s — terminated.\n" + " Common causes: out-of-memory on a low-RAM host (WSL/container),\n" + " a stuck Node process, or an antivirus scan stalling I/O.\n" + ) + combined += msg + # Force a non-zero rc even if terminate() raced with a clean exit. + if rc == 0: + rc = 124 # GNU `timeout` convention + return subprocess.CompletedProcess(cmd, rc, stdout=combined, stderr="") + + def _run_npm_install_deterministic( npm: str, cwd: Path, @@ -6614,31 +6712,26 @@ def _build_web_ui(web_dir: Path, *, fatal: bool = False) -> bool: if fatal: _say(" Run manually: cd web && npm install && npm run build") return False - # First attempt - r2 = subprocess.run( - [npm, "run", "build"], - cwd=web_dir, - capture_output=True, - text=True, - encoding="utf-8", - errors="replace", - ) + # First attempt — stream output via idle-timeout helper (issue #33788). + # capture_output=True on a long Vite build looks identical to a hang; + # users react by rebooting, which leaves the editable install in a + # half-state. Streaming + idle-kill makes failures observable AND + # recoverable (the stale-dist fallback below handles the kill path). + r2 = _run_with_idle_timeout([npm, "run", "build"], cwd=web_dir) if r2.returncode != 0: # Retry once after a short delay — covers boot-time races on Windows # (antivirus scanning Node.js binaries, npm cache not ready, transient # I/O when launched via Scheduled Task at logon). See issue #23817. _time.sleep(3) - r2 = subprocess.run( - [npm, "run", "build"], - cwd=web_dir, - capture_output=True, - text=True, - encoding="utf-8", - errors="replace", - ) + r2 = _run_with_idle_timeout([npm, "run", "build"], cwd=web_dir) if r2.returncode != 0: - stderr_preview = (r2.stderr or "").strip() + # _run_with_idle_timeout merges stderr into stdout; older callers + # using subprocess.run kept them split. Pull from whichever has + # content so the error surfaces regardless of which path produced + # the CompletedProcess. + build_output = (r2.stderr or "") + (r2.stdout or "") + stderr_preview = build_output.strip() stderr_tail = "\n ".join(stderr_preview.splitlines()[-10:]) if stderr_preview else "" dist_dir = web_dir.parent / "hermes_cli" / "web_dist" dist_index = dist_dir / "index.html" @@ -7129,6 +7222,11 @@ def _update_via_zip(args): _install_python_dependencies_with_optional_fallback(pip_cmd) _update_node_dependencies() + # Core (Python deps + git pull / ZIP extract) is now complete; the CLI + # is functional from this point onward. The web UI build below is + # optional — a failure here only affects ``hermes dashboard``. Make + # that visible so users don't panic and reboot mid-build (#33788). + print("→ Core update complete. Building dashboard (optional)...") _build_web_ui(PROJECT_ROOT / "web") # Sync skills @@ -9203,6 +9301,10 @@ def _cmd_update_impl(args, gateway_mode: bool): _refresh_active_lazy_features() _update_node_dependencies() + # See note above (ZIP path): core is now complete, web UI build is + # optional from a CLI perspective. Telegraphing this avoids the + # "stuck at webui-build → reboot → broken install" trap (#33788). + print("→ Core update complete. Building dashboard (optional)...") _build_web_ui(PROJECT_ROOT / "web") print() diff --git a/tests/hermes_cli/test_cmd_update.py b/tests/hermes_cli/test_cmd_update.py index 6610bfc810e..0cb8d033eb8 100644 --- a/tests/hermes_cli/test_cmd_update.py +++ b/tests/hermes_cli/test_cmd_update.py @@ -144,7 +144,13 @@ class TestCmdUpdateBranchFallback: mock_run.side_effect = _make_run_side_effect( branch="main", verify_ok=True, commit_count="1" ) - with patch.object(hm, "_is_termux_env", return_value=False): + # The web UI build runs through _run_with_idle_timeout now (issue + # #33788) so it no longer appears in subprocess.run's call list. + # Mock it so the test doesn't actually shell out to ``tsc``. + import subprocess as _subprocess + build_ok = _subprocess.CompletedProcess([], 0, stdout="", stderr="") + with patch.object(hm, "_is_termux_env", return_value=False), \ + patch.object(hm, "_run_with_idle_timeout", return_value=build_ok) as mock_idle: cmd_update(mock_args) npm_calls = [ @@ -153,10 +159,11 @@ class TestCmdUpdateBranchFallback: if call.args and call.args[0][0] == "/usr/bin/npm" ] - # cmd_update runs npm commands in three locations: - # 1. repo root — slash-command / TUI bridge deps - # 2. ui-tui/ — Ink TUI deps - # 3. web/ — install + "npm run build" for the web frontend + # cmd_update runs npm commands in four locations: + # 1. repo root — slash-command / TUI bridge deps (subprocess.run) + # 2. ui-tui/ — Ink TUI deps (subprocess.run) + # 3. web/ — npm install (subprocess.run) + # 4. web/ — npm run build (_run_with_idle_timeout) # # Repo-root and ui-tui installs intentionally omit `--silent` and run # without `capture_output` so optional postinstall scripts (e.g. @@ -175,11 +182,18 @@ class TestCmdUpdateBranchFallback: (update_flags, PROJECT_ROOT / "ui-tui"), ] if len(npm_calls) > 2: + # Only the web/ install is left in subprocess.run; the build moved + # to _run_with_idle_timeout to make Vite progress visible (#33788). assert npm_calls[2:] == [ (["/usr/bin/npm", "ci", "--silent"], PROJECT_ROOT / "web"), - (["/usr/bin/npm", "run", "build"], PROJECT_ROOT / "web"), ] + # The web UI build itself went through the streaming helper. + mock_idle.assert_called_once() + idle_args, idle_kwargs = mock_idle.call_args + assert idle_args[0] == ["/usr/bin/npm", "run", "build"] + assert idle_kwargs["cwd"] == PROJECT_ROOT / "web" + # Regression for #18840: repo root + ui-tui installs must stream # output (capture_output=False) so postinstall progress is visible # to the user. diff --git a/tests/hermes_cli/test_run_with_idle_timeout.py b/tests/hermes_cli/test_run_with_idle_timeout.py new file mode 100644 index 00000000000..37308f116a4 --- /dev/null +++ b/tests/hermes_cli/test_run_with_idle_timeout.py @@ -0,0 +1,67 @@ +"""Coverage for _run_with_idle_timeout — the streaming subprocess helper. + +Kept in a dedicated test file because the tests spawn real ``subprocess.Popen`` +instances; pytest-isolate runs each test file in its own worker process, so +isolating these here prevents real-Popen state from racing with the +``subprocess.run`` / ``_run_with_idle_timeout`` patches used by +``test_web_ui_build.py``. + +Added for issue #33788: ``hermes update`` got stuck at "webui-build" because +``npm run build`` ran with ``capture_output=True`` and no timeout. The helper +fixes both halves — streams output AND idle-kills the process. +""" + +import sys as _sys +import time + +from hermes_cli.main import _run_with_idle_timeout + + +def test_streams_output_and_returns_zero_on_success(tmp_path): + script = tmp_path / "ok.py" + script.write_text("print('line one'); print('line two')\n") + result = _run_with_idle_timeout( + [_sys.executable, str(script)], cwd=tmp_path, idle_timeout_seconds=10 + ) + assert result.returncode == 0 + assert "line one" in result.stdout + assert "line two" in result.stdout + + +def test_propagates_nonzero_exit(tmp_path): + script = tmp_path / "fail.py" + script.write_text("import sys; print('boom', file=sys.stderr); sys.exit(7)\n") + result = _run_with_idle_timeout( + [_sys.executable, str(script)], cwd=tmp_path, idle_timeout_seconds=10 + ) + assert result.returncode == 7 + # stderr is merged into stdout in the helper. + assert "boom" in result.stdout + + +def test_kills_process_on_idle_timeout(tmp_path): + # Sleeps without printing — exactly the failure mode users see when + # `npm run build` stalls. Idle timeout must terminate it. + script = tmp_path / "stall.py" + script.write_text("import time; time.sleep(30)\n") + + start = time.monotonic() + result = _run_with_idle_timeout( + [_sys.executable, str(script)], + cwd=tmp_path, + idle_timeout_seconds=1, + ) + elapsed = time.monotonic() - start + # Should have died well before the 30s sleep completes. + assert elapsed < 15 + assert result.returncode != 0 + assert "produced no output" in result.stdout + + +def test_returns_127_when_binary_missing(tmp_path): + result = _run_with_idle_timeout( + ["/nonexistent/binary/does/not/exist"], + cwd=tmp_path, + idle_timeout_seconds=5, + ) + assert result.returncode == 127 diff --git a/tests/hermes_cli/test_web_ui_build.py b/tests/hermes_cli/test_web_ui_build.py index 6400075b861..5288ca32592 100644 --- a/tests/hermes_cli/test_web_ui_build.py +++ b/tests/hermes_cli/test_web_ui_build.py @@ -113,12 +113,17 @@ class TestBuildWebUISkipsWhenFresh: web_dir, _ = _make_web_dir(tmp_path) mock_cp = __import__("subprocess").CompletedProcess([], 0, stdout=b"", stderr=b"") + build_ok = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="") with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \ - patch("hermes_cli.main.subprocess.run", return_value=mock_cp) as mock_run: + patch("hermes_cli.main.subprocess.run", return_value=mock_cp) as mock_run, \ + patch("hermes_cli.main._run_with_idle_timeout", return_value=build_ok) as mock_idle: result = _build_web_ui(web_dir) assert result is True - assert mock_run.call_count == 2 # npm install + npm run build + # npm install goes through subprocess.run; npm run build goes through + # _run_with_idle_timeout (issue #33788). + assert mock_run.call_count == 1 # install only + assert mock_idle.call_count == 1 # build only def test_npm_install_uses_utf8_replace_output_decoding(self, tmp_path): web_dir, _ = _make_web_dir(tmp_path) @@ -134,19 +139,29 @@ class TestBuildWebUISkipsWhenFresh: assert kwargs["encoding"] == "utf-8" assert kwargs["errors"] == "replace" - def test_web_build_uses_utf8_replace_output_decoding(self, tmp_path): + def test_web_build_uses_idle_timeout_helper(self, tmp_path): + """npm run build now goes through _run_with_idle_timeout (issue #33788). + + The install step keeps its capture_output behavior (the existing + retry-on-EPERM contract depends on it); only the long-running build + step is streamed + idle-killed. + """ web_dir, _ = _make_web_dir(tmp_path) - mock_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="") + install_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="") + build_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="") with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \ - patch("hermes_cli.main.subprocess.run", side_effect=[mock_cp, mock_cp]) as mock_run: + patch("hermes_cli.main.subprocess.run", return_value=install_cp), \ + patch("hermes_cli.main._run_with_idle_timeout", return_value=build_cp) as mock_idle: result = _build_web_ui(web_dir) assert result is True - _, build_kwargs = mock_run.call_args_list[1] - assert build_kwargs["text"] is True - assert build_kwargs["encoding"] == "utf-8" - assert build_kwargs["errors"] == "replace" + # Build was invoked through the idle-timeout helper, not subprocess.run. + mock_idle.assert_called_once() + args, kwargs = mock_idle.call_args + # Positional: [npm, "run", "build"]; cwd passed as kwarg. + assert args[0] == ["/usr/bin/npm", "run", "build"] + assert kwargs["cwd"] == web_dir class TestBuildWebUIRetryAndStaleFallback: @@ -155,18 +170,19 @@ class TestBuildWebUIRetryAndStaleFallback: def test_retries_build_once_on_failure(self, tmp_path): web_dir, _ = _make_web_dir(tmp_path) Subprocess = __import__("subprocess") - # install: success; build attempt 1: fail; build attempt 2: success install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="") - build_fail = Subprocess.CompletedProcess([], 1, stdout="", stderr="EPERM") + # build attempt 1: fail; build attempt 2: success. + build_fail = Subprocess.CompletedProcess([], 1, stdout="EPERM", stderr="") build_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="") with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \ patch("hermes_cli.main._time.sleep") as mock_sleep, \ - patch("hermes_cli.main.subprocess.run", - side_effect=[install_ok, build_fail, build_ok]) as mock_run: + patch("hermes_cli.main.subprocess.run", return_value=install_ok), \ + patch("hermes_cli.main._run_with_idle_timeout", + side_effect=[build_fail, build_ok]) as mock_idle: result = _build_web_ui(web_dir) assert result is True - assert mock_run.call_count == 3 # install + build + retry + assert mock_idle.call_count == 2 # build + retry mock_sleep.assert_called_once_with(3) def test_falls_back_to_stale_dist_when_retry_also_fails(self, tmp_path, capsys): @@ -177,11 +193,12 @@ class TestBuildWebUIRetryAndStaleFallback: Subprocess = __import__("subprocess") install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="") - build_fail = Subprocess.CompletedProcess([], 1, stdout="", stderr="vite ENOMEM") + build_fail = Subprocess.CompletedProcess([], 1, stdout="vite ENOMEM", stderr="") with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \ patch("hermes_cli.main._time.sleep"), \ - patch("hermes_cli.main.subprocess.run", - side_effect=[install_ok, build_fail, build_fail]): + patch("hermes_cli.main.subprocess.run", return_value=install_ok), \ + patch("hermes_cli.main._run_with_idle_timeout", + side_effect=[build_fail, build_fail]): result = _build_web_ui(web_dir, fatal=True) # MUST return True (serve stale) — issue #23817 — even with fatal=True, @@ -189,18 +206,19 @@ class TestBuildWebUIRetryAndStaleFallback: assert result is True out = capsys.readouterr().out assert "serving stale dist as fallback" in out - assert "vite ENOMEM" in out # stderr surfaced to user + assert "vite ENOMEM" in out # combined output surfaced to user def test_hard_fails_when_no_dist_to_fall_back_to(self, tmp_path, capsys): web_dir, _ = _make_web_dir(tmp_path) Subprocess = __import__("subprocess") install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="") - build_fail = Subprocess.CompletedProcess([], 1, stdout="", stderr="vite ENOMEM") + build_fail = Subprocess.CompletedProcess([], 1, stdout="vite ENOMEM", stderr="") with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \ patch("hermes_cli.main._time.sleep"), \ - patch("hermes_cli.main.subprocess.run", - side_effect=[install_ok, build_fail, build_fail]): + patch("hermes_cli.main.subprocess.run", return_value=install_ok), \ + patch("hermes_cli.main._run_with_idle_timeout", + side_effect=[build_fail, build_fail]): result = _build_web_ui(web_dir, fatal=True) assert result is False