hermes-agent/tests/hermes_cli/test_ensure_utf8_locale.py
kshitij 2f19512341
fix(cli): repair non-UTF-8 stdout/stderr on all platforms, not just Windows (#43439)
`hermes setup` (and other banner-printing commands) crash with an unhandled
UnicodeEncodeError on Linux hosts whose locale selects a non-UTF-8 codec —
e.g. a fresh Raspberry Pi / minimal Debian with a latin-1 or C/POSIX locale.
The setup wizard prints box-drawing characters (┌│├└─) and the ⚕ glyph before
any stream repair runs, so the command dies before it can start.

The existing _ensure_utf8() shim already knew how to re-wrap the standard
streams as UTF-8, but it returned early on `sys.platform != "win32"`, so the
identical crash class on Linux was never covered.

- Drop the win32 gate: repair any stdout/stderr whose encoding is not UTF-8.
- Prefer TextIOWrapper.reconfigure() so the stream object is fixed in place
  (cached sys.stdout references keep working); fall back to reopening the fd
  with closefd=False (the CPython-recommended safe variant).
- Use errors="replace" — matching the sibling hermes_cli/stdio.py shim — so a
  stray un-encodable byte degrades gracefully instead of crashing.
- Only set the PYTHONUTF8/PYTHONIOENCODING child-process hints when a repair
  actually happened, so a healthy UTF-8 host sees zero footprint (no stream
  swap, no env mutation).

This is intentionally the earliest, platform-agnostic guard, running at import
time before any banner prints. hermes_cli/stdio.py::configure_windows_stdio()
still runs later from the entry points for the Windows-only extras (console
code-page flip, EDITOR default, PATH augmentation); it early-returns on
non-Windows and its stream reconfigure is an idempotent no-op once we've
already repaired the streams here.

Add regression tests covering latin-1 and ascii/POSIX streams, the reconfigure
fallback, already-UTF-8 no-op (identity preserved + no env mutation), the
repair-sets-env and respects-explicit-env contracts, and hostile/None streams.
2026-06-10 02:21:00 -07:00

179 lines
6.3 KiB
Python

"""Regression tests for hermes_cli._ensure_utf8().
Covers the crash class where the setup wizard (and other banner-printing
commands) emit box-drawing characters and the ⚕ glyph, which raise
UnicodeEncodeError when stdout/stderr are bound to a non-UTF-8 codec.
Historically the repair was gated on ``sys.platform == "win32"`` and only
caught the Windows cp1252 case. Linux hosts with a latin-1 / C / POSIX locale
(common on minimal Debian installs and Raspberry Pi) hit the identical crash
in ``hermes setup`` because the repair returned early. See the Raspberry Pi
report: latin-1 locale → UnicodeEncodeError before the wizard could start.
"""
import io
import os
import sys
import hermes_cli
# The exact glyphs the setup wizard / banners print (setup.py ~line 2962+):
# box-drawing characters plus the ⚕ symbol. Neither latin-1 nor ASCII can
# encode them, which is what makes this string a reproducer for the crash.
_BANNER = "┌─────┐\n│ ⚕ Hermes │\n└─────┘"
class _FakeStream:
"""Minimal text stream backed by an in-memory byte buffer with a codec.
Mirrors how CPython binds sys.stdout to the locale encoding: writes that
can't be encoded raise UnicodeEncodeError, just like a real latin-1 TTY.
"""
def __init__(self, encoding, *, supports_reconfigure=True):
self.encoding = encoding
self._supports_reconfigure = supports_reconfigure
self.errors = "strict"
self._buf = io.BytesIO()
def write(self, s):
self._buf.write(s.encode(self.encoding, self.errors))
return len(s)
def flush(self):
pass
def reconfigure(self, *, encoding=None, errors=None):
if not self._supports_reconfigure:
raise AttributeError("reconfigure")
if encoding is not None:
self.encoding = encoding
if errors is not None:
self.errors = errors
def getvalue(self):
return self._buf.getvalue()
def _run_with_streams(monkeypatch, out, err):
    """Install *out*/*err* as sys.stdout/sys.stderr and run the repair.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture; restores everything
            patched here at test teardown.
        out: Object to install as ``sys.stdout``.
        err: Object to install as ``sys.stderr``.
    """
    # _ensure_utf8() may write PYTHONUTF8 / PYTHONIOENCODING into os.environ.
    # Register both names with monkeypatch *before* the call so whatever the
    # repair sets is rolled back at teardown instead of leaking into later
    # tests. The environment the caller arranged is left exactly as it was.
    for name in ("PYTHONUTF8", "PYTHONIOENCODING"):
        if name in os.environ:
            # Re-setting the same value only records it for restoration.
            monkeypatch.setenv(name, os.environ[name])
        else:
            # delenv() on an absent variable records nothing, so set then
            # delete: the variable stays absent but its absence is recorded.
            monkeypatch.setenv(name, "")
            monkeypatch.delenv(name)

    monkeypatch.setattr(sys, "stdout", out, raising=False)
    monkeypatch.setattr(sys, "stderr", err, raising=False)
    hermes_cli._ensure_utf8()
def test_latin1_stdout_is_repaired_to_utf8(monkeypatch):
    """A latin-1 stdout (the Raspberry Pi case) becomes UTF-8 capable."""
    # Sanity: before the fix, the banner cannot be encoded. Use a throwaway
    # stream so the streams handed to the repair below start out pristine.
    probe = _FakeStream("latin-1")
    try:
        probe.write(_BANNER)
    except UnicodeEncodeError:
        pre_fix_crashes = True
    else:
        pre_fix_crashes = False
    assert pre_fix_crashes, "fixture should reproduce the original crash"

    out = _FakeStream("latin-1")
    err = _FakeStream("latin-1")
    _run_with_streams(monkeypatch, out, err)

    assert sys.stdout.encoding.lower().replace("-", "") == "utf8"
    assert sys.stderr.encoding.lower().replace("-", "") == "utf8"

    # The banner now encodes without raising, and the glyph really reaches
    # the buffer as UTF-8. An empty needle would match any bytes, so check
    # for the ⚕ glyph explicitly.
    sys.stdout.write(_BANNER)
    assert "⚕".encode("utf-8") in sys.stdout.getvalue()
def test_ascii_posix_locale_is_repaired(monkeypatch):
    """An ascii stdout (what the C/POSIX locale yields) must be repaired too."""
    ascii_out, ascii_err = _FakeStream("ascii"), _FakeStream("ascii")
    _run_with_streams(monkeypatch, ascii_out, ascii_err)

    normalized = sys.stdout.encoding.lower().replace("-", "")
    assert normalized == "utf8"

    # Would raise UnicodeEncodeError if stdout were still bound to ascii.
    sys.stdout.write(_BANNER)
def test_utf8_stream_left_untouched(monkeypatch):
    """Streams that are already UTF-8 are left completely alone.

    Both stream objects keep their identity, and neither PYTHONUTF8 nor
    PYTHONIOENCODING is written into the process environment.
    """
    hint_vars = ("PYTHONUTF8", "PYTHONIOENCODING")
    healthy_out = _FakeStream("utf-8")
    healthy_err = _FakeStream("utf-8")
    for var in hint_vars:
        monkeypatch.delenv(var, raising=False)

    _run_with_streams(monkeypatch, healthy_out, healthy_err)

    # No stream swap: the very same objects are still installed.
    assert sys.stdout is healthy_out
    assert sys.stderr is healthy_err
    # No env mutation on a healthy UTF-8 host.
    for var in hint_vars:
        assert var not in os.environ
def test_repair_sets_child_process_env(monkeypatch):
    """A repair that actually happens also sets the child-process hints."""
    for var in ("PYTHONUTF8", "PYTHONIOENCODING"):
        monkeypatch.delenv(var, raising=False)

    broken_out = _FakeStream("latin-1")
    broken_err = _FakeStream("latin-1")
    _run_with_streams(monkeypatch, broken_out, broken_err)

    utf8_flag = os.environ.get("PYTHONUTF8")
    io_encoding = os.environ.get("PYTHONIOENCODING")
    assert utf8_flag == "1"
    assert io_encoding == "utf-8"
def test_repair_does_not_override_explicit_env(monkeypatch):
    """An explicit, user-provided PYTHONIOENCODING survives the repair."""
    monkeypatch.setenv("PYTHONIOENCODING", "utf-16")
    monkeypatch.delenv("PYTHONUTF8", raising=False)

    broken_out = _FakeStream("latin-1")
    broken_err = _FakeStream("latin-1")
    _run_with_streams(monkeypatch, broken_out, broken_err)

    # The pre-existing value must still be in place after the repair.
    current = os.environ["PYTHONIOENCODING"]
    assert current == "utf-16"
def test_fallback_when_reconfigure_unavailable(monkeypatch, tmp_path):
    """Streams without reconfigure() fall back to reopening the fd as UTF-8."""
    real_path = tmp_path / "out.txt"

    # The context manager closes the backing file even when an assertion
    # below fails, so a failing run does not leak the descriptor.
    with open(real_path, "w", encoding="latin-1") as fh:

        class _NoReconfigure:
            """latin-1 stream exposing a real fileno() but no reconfigure()."""

            encoding = "latin-1"

            def fileno(self):
                return fh.fileno()

        stream = _NoReconfigure()
        monkeypatch.setattr(sys, "stdout", stream, raising=False)
        monkeypatch.setattr(sys, "stderr", stream, raising=False)
        hermes_cli._ensure_utf8()

        # Replaced with a new UTF-8 stream object (not reconfigured in place).
        assert sys.stdout is not stream
        assert sys.stdout.encoding.lower().replace("-", "") == "utf8"

        # Flush while the descriptor is still open so the bytes reach disk.
        sys.stdout.write(_BANNER)
        sys.stdout.flush()

    # An empty needle would match any file contents, so look for the ⚕
    # glyph to prove the banner was written as UTF-8.
    assert "⚕".encode("utf-8") in real_path.read_bytes()
def test_broken_stream_does_not_raise(monkeypatch):
    """Failures while repairing a stream are swallowed, never propagated."""

    class _Hostile:
        """Non-UTF-8 stream on which every repair avenue raises OSError."""

        encoding = "latin-1"

        def reconfigure(self, *args, **kwargs):
            raise OSError("nope")

        def fileno(self):
            raise OSError("no fd")

    # A separate hostile instance for each standard stream.
    for attr in ("stdout", "stderr"):
        monkeypatch.setattr(sys, attr, _Hostile(), raising=False)

    # Passing means this call returns without raising.
    hermes_cli._ensure_utf8()
def test_none_streams_do_not_raise(monkeypatch):
    """sys.stdout / sys.stderr being None (pythonw, detached) is tolerated."""
    for attr in ("stdout", "stderr"):
        monkeypatch.setattr(sys, attr, None, raising=False)

    # Passing means this call returns without raising.
    hermes_cli._ensure_utf8()