# ---------------------------------------------------------------------------
# hermes_cli/__init__.py
# (section that follows ``__release_date__ = "2026.6.5"``; ``os`` and ``sys``
# are imported earlier in that file, outside this excerpt)
# ---------------------------------------------------------------------------


def _is_utf8_encoding(name):
    """Return True when *name* is any spelling/alias of the UTF-8 codec.

    Uses the codec registry so that ``UTF-8``, ``utf8``, ``utf_8``, ``U8`` and
    ``cp65001`` are all recognised. Anything that is not a non-empty string,
    or that the registry does not know, is treated as "not UTF-8".
    """
    import codecs  # local import: keeps this excerpt self-contained

    if not isinstance(name, str) or not name:
        return False
    try:
        return codecs.lookup(name).name == "utf-8"
    except (LookupError, TypeError, ValueError):
        return False


def _reconfigure_in_place(stream):
    """Try to switch *stream* to UTF-8 via ``reconfigure()``.

    Returns True on success, False when the stream has no callable
    ``reconfigure`` or when calling it fails. Fixing the object in place keeps
    its identity, so code already holding a reference to the old
    ``sys.stdout`` benefits from the repair too.
    """
    reconfigure = getattr(stream, "reconfigure", None)
    if not callable(reconfigure):
        return False
    try:
        reconfigure(encoding="utf-8", errors="replace")
    except (AttributeError, OSError, TypeError, ValueError):
        # io.UnsupportedOperation is both an OSError and a ValueError;
        # TypeError covers replaced streams with a different signature.
        return False
    return True


def _reopen_as_utf8(stream_name, stream):
    """Replace ``sys.<stream_name>`` with a UTF-8 wrapper over the same fd.

    Returns True when the replacement was installed, False when the stream
    has no usable file descriptor. ``closefd=False`` keeps the original
    descriptor open when the new wrapper is garbage-collected.
    """
    try:
        replacement = open(
            stream.fileno(), "w", encoding="utf-8",
            errors="replace", buffering=1, closefd=False,
        )
    except (AttributeError, OSError, TypeError, ValueError):
        return False

    # Push out anything still buffered in the old wrapper so output written
    # through the new one cannot overtake it.
    flush = getattr(stream, "flush", None)
    if callable(flush):
        try:
            flush()
        except (OSError, ValueError):
            pass

    setattr(sys, stream_name, replacement)
    return True


def _ensure_utf8():
    """Force UTF-8 stdout/stderr to prevent UnicodeEncodeError crashes.

    Several environments select a legacy, non-UTF-8 encoding for the standard
    streams:

    - Windows services and terminals default to cp1252.
    - Linux hosts with a latin-1 / C / POSIX locale (common on minimal Debian
      installs and Raspberry Pi) select latin-1 or ASCII.

    The CLI prints box-drawing characters (┌│├└─) and the ⚕ glyph in the setup
    wizard, doctor, and status banners. Encoding those under a non-UTF-8 codec
    raises an unhandled UnicodeEncodeError that crashes the command before it
    can even start — e.g. `hermes setup` on a fresh Pi.

    This runs at import time so it protects every CLI subcommand, on any
    platform. For each of stdout/stderr whose encoding is not already UTF-8 it
    tries, in order:

    1. ``reconfigure()`` on the existing stream, so the object is fixed in
       place and cached ``sys.stdout`` references keep working;
    2. reopening the underlying file descriptor as UTF-8 with
       ``closefd=False``. This is used both when ``reconfigure`` is missing
       and when calling it fails.

    Every failure is swallowed: this is a best-effort guard and must never
    raise during import.

    No-op when the streams are already UTF-8 (under any alias of the codec):
    a healthy UTF-8 system sees no stream change and no environment mutation.
    ``PYTHONUTF8`` / ``PYTHONIOENCODING`` are only defaulted (never
    overwritten) when at least one stream was actually repaired.

    Note: this is intentionally the earliest, platform-agnostic guard.
    hermes_cli/stdio.py::configure_windows_stdio() runs later from the entry
    points and layers on the Windows-only extras (console code-page flip,
    EDITOR default, PATH augmentation); its stream reconfiguration is a
    harmless idempotent no-op once we have already repaired the streams here.
    """
    repaired = False

    for stream_name in ("stdout", "stderr"):
        stream = getattr(sys, stream_name, None)
        if stream is None:
            # pythonw / detached processes have no standard streams.
            continue
        if _is_utf8_encoding(getattr(stream, "encoding", None)):
            continue
        if _reconfigure_in_place(stream) or _reopen_as_utf8(stream_name, stream):
            repaired = True

    # Only nudge child processes toward UTF-8 when we actually detected a
    # non-UTF-8 locale. On a healthy UTF-8 host children inherit UTF-8 from the
    # locale already, so leave the environment untouched (minimal footprint).
    if repaired:
        os.environ.setdefault("PYTHONUTF8", "1")
        os.environ.setdefault("PYTHONIOENCODING", "utf-8")


_ensure_utf8()


# ---------------------------------------------------------------------------
# tests/hermes_cli/test_ensure_utf8_locale.py  (new file)
# ---------------------------------------------------------------------------
"""Regression tests for hermes_cli._ensure_utf8().

Covers the crash class where the setup wizard (and other banner-printing
commands) emit box-drawing characters and the ⚕ glyph, which raise
UnicodeEncodeError when stdout/stderr are bound to a non-UTF-8 codec.

Historically the repair was gated on ``sys.platform == "win32"`` and only
caught the Windows cp1252 case. Linux hosts with a latin-1 / C / POSIX locale
(common on minimal Debian installs and Raspberry Pi) hit the identical crash
in ``hermes setup`` because the repair returned early. See the Raspberry Pi
report: latin-1 locale → UnicodeEncodeError before the wizard could start.
"""

import io
import os
import sys

import hermes_cli


# The exact glyphs the setup wizard / banners print (setup.py ~line 2962+).
_BANNER = "┌─────┐\n│ ⚕ Hermes │\n└─────┘"

# Environment hints _ensure_utf8() may write for child processes.
_CHILD_ENV_HINTS = ("PYTHONUTF8", "PYTHONIOENCODING")


def _preserve_child_env(monkeypatch):
    """Make sure env hints written by _ensure_utf8() are undone at teardown.

    _ensure_utf8() writes to os.environ directly, so monkeypatch knows nothing
    about it. ``monkeypatch.delenv(name, raising=False)`` on an absent variable
    registers no undo either. Going through ``setenv`` first records the
    pre-test state (present with value X, or absent) without changing what
    the code under test observes.
    """
    for name in _CHILD_ENV_HINTS:
        current = os.environ.get(name)
        if current is None:
            monkeypatch.setenv(name, "")  # records "was absent"
            monkeypatch.delenv(name)      # absent again for the call
        else:
            monkeypatch.setenv(name, current)  # same value, records restore


class _FakeStream:
    """Minimal text stream backed by an in-memory byte buffer with a codec.

    Mirrors how CPython binds sys.stdout to the locale encoding: writes that
    can't be encoded raise UnicodeEncodeError, just like a real latin-1 TTY.

    With ``supports_reconfigure=False`` the instance exposes no
    ``reconfigure`` attribute at all, which is what a stream lacking that
    method looks like to ``getattr(stream, "reconfigure", None)``.
    """

    def __init__(self, encoding, *, supports_reconfigure=True):
        self.encoding = encoding
        self._supports_reconfigure = supports_reconfigure
        self.errors = "strict"
        self._buf = io.BytesIO()
        if supports_reconfigure:
            # Bound per instance so unsupported streams truly lack the method.
            self.reconfigure = self._reconfigure

    def write(self, s):
        self._buf.write(s.encode(self.encoding, self.errors))
        return len(s)

    def flush(self):
        pass

    def _reconfigure(self, *, encoding=None, errors=None):
        if encoding is not None:
            self.encoding = encoding
        if errors is not None:
            self.errors = errors

    def getvalue(self):
        return self._buf.getvalue()


def _run_with_streams(monkeypatch, out, err):
    """Install *out*/*err* as sys.stdout/sys.stderr and run the repair."""
    _preserve_child_env(monkeypatch)
    monkeypatch.setattr(sys, "stdout", out, raising=False)
    monkeypatch.setattr(sys, "stderr", err, raising=False)
    hermes_cli._ensure_utf8()


def test_latin1_stdout_is_repaired_to_utf8(monkeypatch):
    """A latin-1 stdout (the Raspberry Pi case) becomes UTF-8 capable."""
    # Sanity: before the fix, the banner cannot be encoded.
    probe = _FakeStream("latin-1")
    try:
        probe.write(_BANNER)
    except UnicodeEncodeError:
        pre_fix_crashes = True
    else:
        pre_fix_crashes = False
    assert pre_fix_crashes, "fixture should reproduce the original crash"

    out = _FakeStream("latin-1")
    err = _FakeStream("latin-1")
    _run_with_streams(monkeypatch, out, err)

    assert sys.stdout.encoding.lower().replace("-", "") == "utf8"
    assert sys.stderr.encoding.lower().replace("-", "") == "utf8"
    # The banner now encodes without raising.
    sys.stdout.write(_BANNER)
    assert "⚕".encode("utf-8") in sys.stdout.getvalue()


def test_ascii_posix_locale_is_repaired(monkeypatch):
    """C/POSIX locale resolves to ascii stdout — also must be repaired."""
    out = _FakeStream("ascii")
    err = _FakeStream("ascii")
    _run_with_streams(monkeypatch, out, err)
    assert sys.stdout.encoding.lower().replace("-", "") == "utf8"
    sys.stdout.write(_BANNER)  # no raise


def test_utf8_stream_left_untouched(monkeypatch):
    """Already-UTF-8 streams are a no-op: object identity preserved AND the
    process environment is left untouched (no PYTHONUTF8/PYTHONIOENCODING
    burned in on a healthy UTF-8 host)."""
    out = _FakeStream("utf-8")
    err = _FakeStream("utf-8")
    sentinel_out, sentinel_err = out, err
    monkeypatch.delenv("PYTHONUTF8", raising=False)
    monkeypatch.delenv("PYTHONIOENCODING", raising=False)
    _run_with_streams(monkeypatch, out, err)
    assert sys.stdout is sentinel_out
    assert sys.stderr is sentinel_err
    # Healthy UTF-8 host: no environment mutation (minimal footprint).
    assert "PYTHONUTF8" not in os.environ
    assert "PYTHONIOENCODING" not in os.environ


def test_repair_sets_child_process_env(monkeypatch):
    """When a real repair happens, child-process UTF-8 hints are set."""
    monkeypatch.delenv("PYTHONUTF8", raising=False)
    monkeypatch.delenv("PYTHONIOENCODING", raising=False)
    _run_with_streams(monkeypatch, _FakeStream("latin-1"), _FakeStream("latin-1"))
    assert os.environ.get("PYTHONUTF8") == "1"
    assert os.environ.get("PYTHONIOENCODING") == "utf-8"


def test_repair_does_not_override_explicit_env(monkeypatch):
    """A user's explicit PYTHONIOENCODING is respected (setdefault, not set)."""
    monkeypatch.setenv("PYTHONIOENCODING", "utf-16")
    monkeypatch.delenv("PYTHONUTF8", raising=False)
    _run_with_streams(monkeypatch, _FakeStream("latin-1"), _FakeStream("latin-1"))
    assert os.environ["PYTHONIOENCODING"] == "utf-16"


def test_fallback_when_reconfigure_unavailable(monkeypatch, tmp_path):
    """Streams without reconfigure() fall back to reopening the fd as UTF-8."""
    real_path = tmp_path / "out.txt"

    # The context manager closes the handle even when an assertion fails.
    with open(real_path, "w", encoding="latin-1") as fh:

        class _NoReconfigure:
            """latin-1 stream exposing a real fileno() but no reconfigure()."""

            encoding = "latin-1"

            def fileno(self):
                return fh.fileno()

        stream = _NoReconfigure()
        _preserve_child_env(monkeypatch)
        monkeypatch.setattr(sys, "stdout", stream, raising=False)
        monkeypatch.setattr(sys, "stderr", stream, raising=False)
        hermes_cli._ensure_utf8()

        # Replaced with a new UTF-8 stream object (not reconfigured in place).
        assert sys.stdout is not stream
        assert sys.stdout.encoding.lower().replace("-", "") == "utf8"
        sys.stdout.write(_BANNER)
        sys.stdout.flush()

    assert "⚕".encode("utf-8") in real_path.read_bytes()


def test_broken_stream_does_not_raise(monkeypatch):
    """A stream whose repair raises must be swallowed, never crash import."""

    class _Hostile:
        encoding = "latin-1"

        def reconfigure(self, *a, **k):
            raise OSError("nope")

        def fileno(self):
            raise OSError("no fd")

    monkeypatch.setattr(sys, "stdout", _Hostile(), raising=False)
    monkeypatch.setattr(sys, "stderr", _Hostile(), raising=False)
    # Must not propagate.
    hermes_cli._ensure_utf8()


def test_none_streams_do_not_raise(monkeypatch):
    """pythonw / detached streams (sys.stdout is None) must be tolerated."""
    monkeypatch.setattr(sys, "stdout", None, raising=False)
    monkeypatch.setattr(sys, "stderr", None, raising=False)
    hermes_cli._ensure_utf8()