From 432a691758083994a043a5698fa109f81648693d Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 28 May 2026 03:34:47 -0700
Subject: [PATCH] fix(update): stream + idle-kill `npm run build` so a stalled
 webui-build can't soft-brick the install (#33803)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`hermes update` ran the webui build with `capture_output=True` and no timeout. On low-memory hosts (WSL2's 4 GB default, small VPSes, antivirus stalls) Vite goes silent for minutes; users see a frozen terminal, decide the update is hung, and reboot. The reboot lands *after* `pip install -e .` has already touched the install but *before* the build completes, leaving the `hermes` launcher in place while `hermes_cli` is no longer importable — i.e. `ModuleNotFoundError: No module named 'hermes_cli'` (#33788, same class as #32384).

Changes:

- New `_run_with_idle_timeout()` helper: streams subprocess output line-by-line (so the user sees Vite progress in real time) and kills the process if no bytes appear on stdout/stderr for 180s. The existing stale-dist fallback (#23817) then serves the previous build instead of failing the update.
- `_build_web_ui()` uses the helper for `npm run build` (the actual stall site). `npm install` keeps `subprocess.run` + capture_output to preserve the existing EPERM-retry-on-Windows contract.
- Both `cmd_update` call sites print `→ Core update complete. Building dashboard (optional)...` before the webui build. The CLI is fully functional at this point; a webui-build failure only affects `hermes dashboard`. Telegraphing the boundary explicitly stops users from rebooting through the build step.

Tests:

- `tests/hermes_cli/test_run_with_idle_timeout.py` — 4 tests covering streaming success, nonzero exit, idle-kill, and missing-binary cases. Uses real `subprocess.Popen` on tiny Python scripts; isolated in its own file so per-file canonical-runner parallelism doesn't pair it with the mock-heavy tests.
- `tests/hermes_cli/test_web_ui_build.py` — updated existing tests to patch `_run_with_idle_timeout` for the build step in addition to `subprocess.run` for the install step.
- `tests/hermes_cli/test_cmd_update.py::test_update_refreshes_repo_and_tui_node_dependencies` — same update.

Full suite: `scripts/run_tests.sh tests/hermes_cli/` → 5646 passed, 0 failed.

Fixes #33788.
---
 hermes_cli/main.py                            | 138 +++++++++++++++---
 tests/hermes_cli/test_cmd_update.py           |  26 +++-
 .../hermes_cli/test_run_with_idle_timeout.py  |  67 +++++++++
 tests/hermes_cli/test_web_ui_build.py         |  60 +++++---
 4 files changed, 246 insertions(+), 45 deletions(-)
 create mode 100644 tests/hermes_cli/test_run_with_idle_timeout.py

diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 4d9b37db240..4490d59cbd6 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -6509,6 +6509,104 @@ def _web_ui_build_needed(web_dir: Path) -> bool:
     return False
 
 
+def _run_with_idle_timeout(
+    cmd: list[str],
+    cwd: Path,
+    *,
+    idle_timeout_seconds: int = 180,
+    indent: str = "    ",
+) -> subprocess.CompletedProcess:
+    """Run a subprocess that streams output, with an idle-output timeout.
+
+    Issue #33788: ``npm run build`` (Vite) was invoked with
+    ``capture_output=True`` and no timeout. On low-memory hosts (notably
+    WSL2 with the default 4 GB cap) the build can stall or sit silent for
+    minutes; users see a frozen terminal, assume the update is hung, and
+    reboot — leaving the editable install in a half-state with the
+    ``hermes`` launcher present but ``hermes_cli`` not importable.
+
+    This helper fixes both halves: stdout is streamed (so the user sees
+    progress), and if no bytes have appeared on stdout/stderr for
+    ``idle_timeout_seconds``, the process is terminated and the call
+    returns with a non-zero ``returncode``. The caller's existing
+    stale-dist fallback (#23817) takes over from there.
+
+    Returns a ``CompletedProcess`` with merged stdout (text), empty
+    stderr, and an integer returncode. Never raises on idle timeout —
+    propagation of failure is via the returncode.
+    """
+    merged_chunks: list[str] = []
+    last_output_ts = _time.monotonic()
+    lock = threading.Lock()
+
+    try:
+        proc = subprocess.Popen(
+            cmd,
+            cwd=cwd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+        )
+    except OSError as exc:
+        # E.g. npm not on PATH between the which() check and now.
+        return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc))
+
+    def _reader() -> None:
+        nonlocal last_output_ts
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            try:
+                print(f"{indent}{line.rstrip()}", flush=True)
+            except UnicodeEncodeError:
+                # Windows cp1252 fallback — same pattern as _say().
+                enc = getattr(sys.stdout, "encoding", None) or "ascii"
+                safe = line.rstrip().encode(enc, errors="replace").decode(enc, errors="replace")
+                print(f"{indent}{safe}", flush=True)
+            with lock:
+                merged_chunks.append(line)
+                last_output_ts = _time.monotonic()
+
+    reader_thread = threading.Thread(target=_reader, daemon=True)
+    reader_thread.start()
+
+    idle_killed = False
+    while True:
+        try:
+            rc = proc.wait(timeout=5)
+            break
+        except subprocess.TimeoutExpired:
+            with lock:
+                idle = _time.monotonic() - last_output_ts
+            if idle > idle_timeout_seconds:
+                idle_killed = True
+                proc.terminate()
+                try:
+                    rc = proc.wait(timeout=3)
+                except subprocess.TimeoutExpired:
+                    proc.kill()
+                    rc = proc.wait()
+                break
+
+    # Drain reader so we don't leak the stdout file descriptor.
+    reader_thread.join(timeout=2)
+
+    combined = "".join(merged_chunks)
+    if idle_killed:
+        msg = (
+            f"\n  ⚠ Build produced no output for {idle_timeout_seconds}s — terminated.\n"
+            "    Common causes: out-of-memory on a low-RAM host (WSL/container),\n"
+            "    a stuck Node process, or an antivirus scan stalling I/O.\n"
+        )
+        combined += msg
+        # Force a non-zero rc even if terminate() raced with a clean exit.
+        if rc == 0:
+            rc = 124  # GNU `timeout` convention
+    return subprocess.CompletedProcess(cmd, rc, stdout=combined, stderr="")
+
+
 def _run_npm_install_deterministic(
     npm: str,
     cwd: Path,
@@ -6614,31 +6712,26 @@ def _build_web_ui(web_dir: Path, *, fatal: bool = False) -> bool:
         if fatal:
             _say("  Run manually:  cd web && npm install && npm run build")
         return False
-    # First attempt
-    r2 = subprocess.run(
-        [npm, "run", "build"],
-        cwd=web_dir,
-        capture_output=True,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-    )
+    # First attempt — stream output via idle-timeout helper (issue #33788).
+    # capture_output=True on a long Vite build looks identical to a hang;
+    # users react by rebooting, which leaves the editable install in a
+    # half-state. Streaming + idle-kill makes failures observable AND
+    # recoverable (the stale-dist fallback below handles the kill path).
+    r2 = _run_with_idle_timeout([npm, "run", "build"], cwd=web_dir)
     if r2.returncode != 0:
         # Retry once after a short delay — covers boot-time races on Windows
         # (antivirus scanning Node.js binaries, npm cache not ready, transient
         # I/O when launched via Scheduled Task at logon). See issue #23817.
         _time.sleep(3)
-        r2 = subprocess.run(
-            [npm, "run", "build"],
-            cwd=web_dir,
-            capture_output=True,
-            text=True,
-            encoding="utf-8",
-            errors="replace",
-        )
+        r2 = _run_with_idle_timeout([npm, "run", "build"], cwd=web_dir)
 
     if r2.returncode != 0:
-        stderr_preview = (r2.stderr or "").strip()
+        # _run_with_idle_timeout merges stderr into stdout; older callers
+        # using subprocess.run kept them split. Pull from whichever has
+        # content so the error surfaces regardless of which path produced
+        # the CompletedProcess.
+        build_output = (r2.stderr or "") + (r2.stdout or "")
+        stderr_preview = build_output.strip()
         stderr_tail = "\n  ".join(stderr_preview.splitlines()[-10:]) if stderr_preview else ""
         dist_dir = web_dir.parent / "hermes_cli" / "web_dist"
         dist_index = dist_dir / "index.html"
@@ -7129,6 +7222,11 @@ def _update_via_zip(args):
         _install_python_dependencies_with_optional_fallback(pip_cmd)
 
     _update_node_dependencies()
+    # Core (Python deps + git pull / ZIP extract) is now complete; the CLI
+    # is functional from this point onward. The web UI build below is
+    # optional — a failure here only affects ``hermes dashboard``. Make
+    # that visible so users don't panic and reboot mid-build (#33788).
+    print("→ Core update complete. Building dashboard (optional)...")
     _build_web_ui(PROJECT_ROOT / "web")
 
     # Sync skills
@@ -9203,6 +9301,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
         _refresh_active_lazy_features()
 
         _update_node_dependencies()
+        # See note above (ZIP path): core is now complete, web UI build is
+        # optional from a CLI perspective. Telegraphing this avoids the
+        # "stuck at webui-build → reboot → broken install" trap (#33788).
+        print("→ Core update complete. Building dashboard (optional)...")
         _build_web_ui(PROJECT_ROOT / "web")
 
         print()
diff --git a/tests/hermes_cli/test_cmd_update.py b/tests/hermes_cli/test_cmd_update.py
index 6610bfc810e..0cb8d033eb8 100644
--- a/tests/hermes_cli/test_cmd_update.py
+++ b/tests/hermes_cli/test_cmd_update.py
@@ -144,7 +144,13 @@ class TestCmdUpdateBranchFallback:
         mock_run.side_effect = _make_run_side_effect(
             branch="main", verify_ok=True, commit_count="1"
         )
-        with patch.object(hm, "_is_termux_env", return_value=False):
+        # The web UI build runs through _run_with_idle_timeout now (issue
+        # #33788) so it no longer appears in subprocess.run's call list.
+        # Mock it so the test doesn't actually shell out to ``tsc``.
+        import subprocess as _subprocess
+        build_ok = _subprocess.CompletedProcess([], 0, stdout="", stderr="")
+        with patch.object(hm, "_is_termux_env", return_value=False), \
+             patch.object(hm, "_run_with_idle_timeout", return_value=build_ok) as mock_idle:
             cmd_update(mock_args)
 
         npm_calls = [
@@ -153,10 +159,11 @@ class TestCmdUpdateBranchFallback:
             if call.args and call.args[0][0] == "/usr/bin/npm"
         ]
 
-        # cmd_update runs npm commands in three locations:
-        #   1. repo root  — slash-command / TUI bridge deps
-        #   2. ui-tui/    — Ink TUI deps
-        #   3. web/       — install + "npm run build" for the web frontend
+        # cmd_update runs npm commands in four locations:
+        #   1. repo root  — slash-command / TUI bridge deps  (subprocess.run)
+        #   2. ui-tui/    — Ink TUI deps                     (subprocess.run)
+        #   3. web/       — npm install                      (subprocess.run)
+        #   4. web/       — npm run build                    (_run_with_idle_timeout)
         #
         # Repo-root and ui-tui installs intentionally omit `--silent` and run
         # without `capture_output` so optional postinstall scripts (e.g.
@@ -175,11 +182,18 @@ class TestCmdUpdateBranchFallback:
             (update_flags, PROJECT_ROOT / "ui-tui"),
         ]
         if len(npm_calls) > 2:
+            # Only the web/ install is left in subprocess.run; the build moved
+            # to _run_with_idle_timeout to make Vite progress visible (#33788).
             assert npm_calls[2:] == [
                 (["/usr/bin/npm", "ci", "--silent"], PROJECT_ROOT / "web"),
-                (["/usr/bin/npm", "run", "build"], PROJECT_ROOT / "web"),
             ]
 
+        # The web UI build itself went through the streaming helper.
+        mock_idle.assert_called_once()
+        idle_args, idle_kwargs = mock_idle.call_args
+        assert idle_args[0] == ["/usr/bin/npm", "run", "build"]
+        assert idle_kwargs["cwd"] == PROJECT_ROOT / "web"
+
         # Regression for #18840: repo root + ui-tui installs must stream
         # output (capture_output=False) so postinstall progress is visible
         # to the user.
diff --git a/tests/hermes_cli/test_run_with_idle_timeout.py b/tests/hermes_cli/test_run_with_idle_timeout.py
new file mode 100644
index 00000000000..37308f116a4
--- /dev/null
+++ b/tests/hermes_cli/test_run_with_idle_timeout.py
@@ -0,0 +1,67 @@
+"""Coverage for _run_with_idle_timeout — the streaming subprocess helper.
+
+Kept in a dedicated test file because the tests spawn real ``subprocess.Popen``
+instances; pytest-isolate runs each test file in its own worker process, so
+isolating these here prevents real-Popen state from racing with the
+``subprocess.run`` / ``_run_with_idle_timeout`` patches used by
+``test_web_ui_build.py``.
+
+Added for issue #33788: ``hermes update`` got stuck at "webui-build" because
+``npm run build`` ran with ``capture_output=True`` and no timeout. The helper
+fixes both halves — streams output AND idle-kills the process.
+"""
+
+import sys as _sys
+import time
+
+from hermes_cli.main import _run_with_idle_timeout
+
+
+def test_streams_output_and_returns_zero_on_success(tmp_path):
+    script = tmp_path / "ok.py"
+    script.write_text("print('line one'); print('line two')\n")
+    result = _run_with_idle_timeout(
+        [_sys.executable, str(script)], cwd=tmp_path, idle_timeout_seconds=10
+    )
+    assert result.returncode == 0
+    assert "line one" in result.stdout
+    assert "line two" in result.stdout
+
+
+def test_propagates_nonzero_exit(tmp_path):
+    script = tmp_path / "fail.py"
+    script.write_text("import sys; print('boom', file=sys.stderr); sys.exit(7)\n")
+    result = _run_with_idle_timeout(
+        [_sys.executable, str(script)], cwd=tmp_path, idle_timeout_seconds=10
+    )
+    assert result.returncode == 7
+    # stderr is merged into stdout in the helper.
+    assert "boom" in result.stdout
+
+
+def test_kills_process_on_idle_timeout(tmp_path):
+    # Sleeps without printing — exactly the failure mode users see when
+    # `npm run build` stalls. Idle timeout must terminate it.
+    script = tmp_path / "stall.py"
+    script.write_text("import time; time.sleep(30)\n")
+
+    start = time.monotonic()
+    result = _run_with_idle_timeout(
+        [_sys.executable, str(script)],
+        cwd=tmp_path,
+        idle_timeout_seconds=1,
+    )
+    elapsed = time.monotonic() - start
+    # Should have died well before the 30s sleep completes.
+    assert elapsed < 15
+    assert result.returncode != 0
+    assert "produced no output" in result.stdout
+
+
+def test_returns_127_when_binary_missing(tmp_path):
+    result = _run_with_idle_timeout(
+        ["/nonexistent/binary/does/not/exist"],
+        cwd=tmp_path,
+        idle_timeout_seconds=5,
+    )
+    assert result.returncode == 127
diff --git a/tests/hermes_cli/test_web_ui_build.py b/tests/hermes_cli/test_web_ui_build.py
index 6400075b861..5288ca32592 100644
--- a/tests/hermes_cli/test_web_ui_build.py
+++ b/tests/hermes_cli/test_web_ui_build.py
@@ -113,12 +113,17 @@ class TestBuildWebUISkipsWhenFresh:
         web_dir, _ = _make_web_dir(tmp_path)
 
         mock_cp = __import__("subprocess").CompletedProcess([], 0, stdout=b"", stderr=b"")
+        build_ok = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
         with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
-             patch("hermes_cli.main.subprocess.run", return_value=mock_cp) as mock_run:
+             patch("hermes_cli.main.subprocess.run", return_value=mock_cp) as mock_run, \
+             patch("hermes_cli.main._run_with_idle_timeout", return_value=build_ok) as mock_idle:
             result = _build_web_ui(web_dir)
 
         assert result is True
-        assert mock_run.call_count == 2  # npm install + npm run build
+        # npm install goes through subprocess.run; npm run build goes through
+        # _run_with_idle_timeout (issue #33788).
+        assert mock_run.call_count == 1   # install only
+        assert mock_idle.call_count == 1  # build only
 
     def test_npm_install_uses_utf8_replace_output_decoding(self, tmp_path):
         web_dir, _ = _make_web_dir(tmp_path)
@@ -134,19 +139,29 @@ class TestBuildWebUISkipsWhenFresh:
         assert kwargs["encoding"] == "utf-8"
         assert kwargs["errors"] == "replace"
 
-    def test_web_build_uses_utf8_replace_output_decoding(self, tmp_path):
+    def test_web_build_uses_idle_timeout_helper(self, tmp_path):
+        """npm run build now goes through _run_with_idle_timeout (issue #33788).
+
+        The install step keeps its capture_output behavior (the existing
+        retry-on-EPERM contract depends on it); only the long-running build
+        step is streamed + idle-killed.
+        """
         web_dir, _ = _make_web_dir(tmp_path)
 
-        mock_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
+        install_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
+        build_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
         with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
-             patch("hermes_cli.main.subprocess.run", side_effect=[mock_cp, mock_cp]) as mock_run:
+             patch("hermes_cli.main.subprocess.run", return_value=install_cp), \
+             patch("hermes_cli.main._run_with_idle_timeout", return_value=build_cp) as mock_idle:
             result = _build_web_ui(web_dir)
 
         assert result is True
-        _, build_kwargs = mock_run.call_args_list[1]
-        assert build_kwargs["text"] is True
-        assert build_kwargs["encoding"] == "utf-8"
-        assert build_kwargs["errors"] == "replace"
+        # Build was invoked through the idle-timeout helper, not subprocess.run.
+        mock_idle.assert_called_once()
+        args, kwargs = mock_idle.call_args
+        # Positional: [npm, "run", "build"]; cwd passed as kwarg.
+        assert args[0] == ["/usr/bin/npm", "run", "build"]
+        assert kwargs["cwd"] == web_dir
 
 
 class TestBuildWebUIRetryAndStaleFallback:
@@ -155,18 +170,19 @@ class TestBuildWebUIRetryAndStaleFallback:
     def test_retries_build_once_on_failure(self, tmp_path):
         web_dir, _ = _make_web_dir(tmp_path)
         Subprocess = __import__("subprocess")
-        # install: success; build attempt 1: fail; build attempt 2: success
         install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
-        build_fail = Subprocess.CompletedProcess([], 1, stdout="", stderr="EPERM")
+        # build attempt 1: fail; build attempt 2: success.
+        build_fail = Subprocess.CompletedProcess([], 1, stdout="EPERM", stderr="")
         build_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
         with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
              patch("hermes_cli.main._time.sleep") as mock_sleep, \
-             patch("hermes_cli.main.subprocess.run",
-                   side_effect=[install_ok, build_fail, build_ok]) as mock_run:
+             patch("hermes_cli.main.subprocess.run", return_value=install_ok), \
+             patch("hermes_cli.main._run_with_idle_timeout",
+                   side_effect=[build_fail, build_ok]) as mock_idle:
             result = _build_web_ui(web_dir)
 
         assert result is True
-        assert mock_run.call_count == 3  # install + build + retry
+        assert mock_idle.call_count == 2  # build + retry
         mock_sleep.assert_called_once_with(3)
 
     def test_falls_back_to_stale_dist_when_retry_also_fails(self, tmp_path, capsys):
@@ -177,11 +193,12 @@ class TestBuildWebUIRetryAndStaleFallback:
 
         Subprocess = __import__("subprocess")
         install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
-        build_fail = Subprocess.CompletedProcess([], 1, stdout="", stderr="vite ENOMEM")
+        build_fail = Subprocess.CompletedProcess([], 1, stdout="vite ENOMEM", stderr="")
         with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
              patch("hermes_cli.main._time.sleep"), \
-             patch("hermes_cli.main.subprocess.run",
-                   side_effect=[install_ok, build_fail, build_fail]):
+             patch("hermes_cli.main.subprocess.run", return_value=install_ok), \
+             patch("hermes_cli.main._run_with_idle_timeout",
+                   side_effect=[build_fail, build_fail]):
             result = _build_web_ui(web_dir, fatal=True)
 
         # MUST return True (serve stale) — issue #23817 — even with fatal=True,
@@ -189,18 +206,19 @@ class TestBuildWebUIRetryAndStaleFallback:
         assert result is True
         out = capsys.readouterr().out
         assert "serving stale dist as fallback" in out
-        assert "vite ENOMEM" in out  # stderr surfaced to user
+        assert "vite ENOMEM" in out  # combined output surfaced to user
 
     def test_hard_fails_when_no_dist_to_fall_back_to(self, tmp_path, capsys):
         web_dir, _ = _make_web_dir(tmp_path)
 
         Subprocess = __import__("subprocess")
         install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
-        build_fail = Subprocess.CompletedProcess([], 1, stdout="", stderr="vite ENOMEM")
+        build_fail = Subprocess.CompletedProcess([], 1, stdout="vite ENOMEM", stderr="")
         with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
              patch("hermes_cli.main._time.sleep"), \
-             patch("hermes_cli.main.subprocess.run",
-                   side_effect=[install_ok, build_fail, build_fail]):
+             patch("hermes_cli.main.subprocess.run", return_value=install_ok), \
+             patch("hermes_cli.main._run_with_idle_timeout",
+                   side_effect=[build_fail, build_fail]):
             result = _build_web_ui(web_dir, fatal=True)
 
         assert result is False