hermes-agent/tests/hermes_cli/test_web_ui_build.py
Teknium 432a691758
fix(update): stream + idle-kill npm run build so a stalled webui-build can't soft-brick the install (#33803)
`hermes update` ran the webui build with `capture_output=True` and no timeout. On low-memory hosts (WSL2's 4 GB default, small VPSes, antivirus stalls) Vite goes silent for minutes; users see a frozen terminal, decide the update is hung, and reboot. The reboot lands *after* `pip install -e .` has already touched the install but *before* the build completes, leaving the `hermes` launcher in place while `hermes_cli` is no longer importable — i.e. `ModuleNotFoundError: No module named 'hermes_cli'` (#33788, same class as #32384).

Changes:

- New `_run_with_idle_timeout()` helper: streams subprocess output line-by-line (so the user sees Vite progress in real time) and kills the process if no bytes appear on stdout/stderr for 180s. The existing stale-dist fallback (#23817) then serves the previous build instead of failing the update.
- `_build_web_ui()` uses the helper for `npm run build` (the actual stall site). `npm install` keeps `subprocess.run` + capture_output to preserve the existing EPERM-retry-on-Windows contract.
- Both `cmd_update` call sites print `→ Core update complete. Building dashboard (optional)...` before the webui build. The CLI is fully functional at this point; a webui-build failure only affects `hermes dashboard`. Telegraphing the boundary explicitly stops users from rebooting through the build step.

Tests:

- `tests/hermes_cli/test_run_with_idle_timeout.py` — 4 tests covering streaming success, nonzero exit, idle-kill, and missing-binary cases. Uses real `subprocess.Popen` on tiny Python scripts; isolated in its own file so per-file canonical-runner parallelism doesn't pair it with the mock-heavy tests.
- `tests/hermes_cli/test_web_ui_build.py` — updated existing tests to patch `_run_with_idle_timeout` for the build step in addition to `subprocess.run` for the install step.
- `tests/hermes_cli/test_cmd_update.py::test_update_refreshes_repo_and_tui_node_dependencies` — same update.

Full suite: `scripts/run_tests.sh tests/hermes_cli/` → 5646 passed, 0 failed.

Fixes #33788.
2026-05-28 03:34:47 -07:00

228 lines
10 KiB
Python

"""Tests for _web_ui_build_needed — staleness check for the web UI dist.
Critical invariant: the Vite build outputs to hermes_cli/web_dist/
(vite.config.ts: outDir: "../hermes_cli/web_dist"), NOT web/dist/.
The sentinel must be checked in the correct output directory or the
freshness check is a no-op and the OOM rebuild always runs.
"""
import os
import time
from pathlib import Path
from unittest.mock import patch
import pytest
from hermes_cli.main import _web_ui_build_needed, _build_web_ui, _run_npm_install_deterministic
def _touch(path: Path, offset: float = 0.0) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.touch()
if offset:
t = time.time() + offset
os.utime(path, (t, t))
def _make_web_dir(tmp_path: Path) -> tuple[Path, Path]:
"""Return (web_dir, dist_dir) matching real repo layout."""
web_dir = tmp_path / "web"
web_dir.mkdir()
(web_dir / "package.json").touch()
dist_dir = tmp_path / "hermes_cli" / "web_dist"
return web_dir, dist_dir
class TestWebUIBuildNeeded:
def test_returns_true_when_dist_missing(self, tmp_path):
web_dir, _ = _make_web_dir(tmp_path)
assert _web_ui_build_needed(web_dir) is True
def test_returns_false_when_vite_manifest_fresh(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(web_dir / "src" / "App.tsx", offset=-10)
_touch(dist_dir / ".vite" / "manifest.json")
assert _web_ui_build_needed(web_dir) is False
def test_returns_true_when_source_newer_than_manifest(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(dist_dir / ".vite" / "manifest.json", offset=-10)
_touch(web_dir / "src" / "App.tsx")
assert _web_ui_build_needed(web_dir) is True
def test_falls_back_to_index_html_when_manifest_missing(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(web_dir / "src" / "main.ts", offset=-10)
_touch(dist_dir / "index.html")
assert _web_ui_build_needed(web_dir) is False
def test_web_dist_dir_not_web_dist_subdir(self, tmp_path):
"""Regression: sentinel must be in hermes_cli/web_dist/, NOT web/dist/."""
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(web_dir / "src" / "App.tsx", offset=-10)
# Place manifest in wrong location (web/dist/) — should NOT count as fresh
wrong_dist = web_dir / "dist" / ".vite" / "manifest.json"
_touch(wrong_dist)
# Correct location is empty → still needs build
assert _web_ui_build_needed(web_dir) is True
def test_returns_true_when_package_lock_newer_than_dist(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(dist_dir / ".vite" / "manifest.json", offset=-10)
_touch(web_dir / "package-lock.json")
assert _web_ui_build_needed(web_dir) is True
def test_returns_true_when_vite_config_newer_than_dist(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(dist_dir / ".vite" / "manifest.json", offset=-10)
_touch(web_dir / "vite.config.ts")
assert _web_ui_build_needed(web_dir) is True
def test_ignores_node_modules(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
# package.json older than manifest; only node_modules file is newer
_touch(web_dir / "package.json", offset=-20)
_touch(dist_dir / ".vite" / "manifest.json", offset=-10)
_touch(web_dir / "node_modules" / "react" / "index.js")
assert _web_ui_build_needed(web_dir) is False
def test_ignores_dist_subdir_under_web(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
# package.json older than manifest; only web/dist file is newer
_touch(web_dir / "package.json", offset=-20)
_touch(dist_dir / ".vite" / "manifest.json", offset=-10)
_touch(web_dir / "dist" / "assets" / "index.js")
assert _web_ui_build_needed(web_dir) is False
class TestBuildWebUISkipsWhenFresh:
def test_skips_npm_when_dist_is_fresh(self, tmp_path):
web_dir, dist_dir = _make_web_dir(tmp_path)
_touch(dist_dir / ".vite" / "manifest.json")
with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
patch("hermes_cli.main.subprocess.run") as mock_run:
result = _build_web_ui(web_dir)
assert result is True
mock_run.assert_not_called()
def test_runs_npm_when_dist_missing(self, tmp_path):
web_dir, _ = _make_web_dir(tmp_path)
mock_cp = __import__("subprocess").CompletedProcess([], 0, stdout=b"", stderr=b"")
build_ok = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
patch("hermes_cli.main.subprocess.run", return_value=mock_cp) as mock_run, \
patch("hermes_cli.main._run_with_idle_timeout", return_value=build_ok) as mock_idle:
result = _build_web_ui(web_dir)
assert result is True
# npm install goes through subprocess.run; npm run build goes through
# _run_with_idle_timeout (issue #33788).
assert mock_run.call_count == 1 # install only
assert mock_idle.call_count == 1 # build only
def test_npm_install_uses_utf8_replace_output_decoding(self, tmp_path):
web_dir, _ = _make_web_dir(tmp_path)
(web_dir / "package-lock.json").write_text("{}", encoding="utf-8")
mock_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
with patch("hermes_cli.main.subprocess.run", return_value=mock_cp) as mock_run:
result = _run_npm_install_deterministic("/usr/bin/npm", web_dir)
assert result.returncode == 0
_, kwargs = mock_run.call_args
assert kwargs["text"] is True
assert kwargs["encoding"] == "utf-8"
assert kwargs["errors"] == "replace"
def test_web_build_uses_idle_timeout_helper(self, tmp_path):
"""npm run build now goes through _run_with_idle_timeout (issue #33788).
The install step keeps its capture_output behavior (the existing
retry-on-EPERM contract depends on it); only the long-running build
step is streamed + idle-killed.
"""
web_dir, _ = _make_web_dir(tmp_path)
install_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
build_cp = __import__("subprocess").CompletedProcess([], 0, stdout="", stderr="")
with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
patch("hermes_cli.main.subprocess.run", return_value=install_cp), \
patch("hermes_cli.main._run_with_idle_timeout", return_value=build_cp) as mock_idle:
result = _build_web_ui(web_dir)
assert result is True
# Build was invoked through the idle-timeout helper, not subprocess.run.
mock_idle.assert_called_once()
args, kwargs = mock_idle.call_args
# Positional: [npm, "run", "build"]; cwd passed as kwarg.
assert args[0] == ["/usr/bin/npm", "run", "build"]
assert kwargs["cwd"] == web_dir
class TestBuildWebUIRetryAndStaleFallback:
"""Coverage for the retry + stale-dist fallback added in #23824 / issue #23817."""
def test_retries_build_once_on_failure(self, tmp_path):
web_dir, _ = _make_web_dir(tmp_path)
Subprocess = __import__("subprocess")
install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
# build attempt 1: fail; build attempt 2: success.
build_fail = Subprocess.CompletedProcess([], 1, stdout="EPERM", stderr="")
build_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
patch("hermes_cli.main._time.sleep") as mock_sleep, \
patch("hermes_cli.main.subprocess.run", return_value=install_ok), \
patch("hermes_cli.main._run_with_idle_timeout",
side_effect=[build_fail, build_ok]) as mock_idle:
result = _build_web_ui(web_dir)
assert result is True
assert mock_idle.call_count == 2 # build + retry
mock_sleep.assert_called_once_with(3)
def test_falls_back_to_stale_dist_when_retry_also_fails(self, tmp_path, capsys):
web_dir, dist_dir = _make_web_dir(tmp_path)
# Stale dist exists but is older than source
_touch(dist_dir / "index.html", offset=-100)
_touch(web_dir / "src" / "App.tsx") # newer source -> build_needed=True
Subprocess = __import__("subprocess")
install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
build_fail = Subprocess.CompletedProcess([], 1, stdout="vite ENOMEM", stderr="")
with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
patch("hermes_cli.main._time.sleep"), \
patch("hermes_cli.main.subprocess.run", return_value=install_ok), \
patch("hermes_cli.main._run_with_idle_timeout",
side_effect=[build_fail, build_fail]):
result = _build_web_ui(web_dir, fatal=True)
# MUST return True (serve stale) — issue #23817 — even with fatal=True,
# because cmd_dashboard passes fatal=True and is the primary caller.
assert result is True
out = capsys.readouterr().out
assert "serving stale dist as fallback" in out
assert "vite ENOMEM" in out # combined output surfaced to user
def test_hard_fails_when_no_dist_to_fall_back_to(self, tmp_path, capsys):
web_dir, _ = _make_web_dir(tmp_path)
Subprocess = __import__("subprocess")
install_ok = Subprocess.CompletedProcess([], 0, stdout="", stderr="")
build_fail = Subprocess.CompletedProcess([], 1, stdout="vite ENOMEM", stderr="")
with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
patch("hermes_cli.main._time.sleep"), \
patch("hermes_cli.main.subprocess.run", return_value=install_ok), \
patch("hermes_cli.main._run_with_idle_timeout",
side_effect=[build_fail, build_fail]):
result = _build_web_ui(web_dir, fatal=True)
assert result is False
out = capsys.readouterr().out
assert "Web UI build failed" in out
assert "vite ENOMEM" in out
assert "Run manually" in out