From b08662b782beed5160e43c6205e76d9f54aa731b Mon Sep 17 00:00:00 2001
From: annguyenNous
Date: Sat, 6 Jun 2026 19:54:58 +0700
Subject: [PATCH] fix(gateway): tolerate Unicode in stderr log handlers on Windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Windows with non-UTF-8 console encodings (e.g. cp949, cp1252),
StreamHandler.emit() raises UnicodeEncodeError when log messages contain
characters outside the console codepage — such as the em-dash (U+2014)
in the session hygiene message. This crashed the gateway process
silently, leaving no diagnostic output.

Fix: add _safe_stderr() helper that re-wraps sys.stderr.buffer in a
TextIOWrapper with errors='replace' when the stream's encoding is not
UTF-8 and its error handler is strict. The wrapper keeps the stream's
own encoding, so text the console can display is not turned into
mojibake and is not mixed with UTF-8 bytes. Applied to both:
- hermes_logging.py setup_verbose_logging() stderr handler
- gateway/run.py optional stderr handler

The wrapper ensures log lines are never lost — un-encodable characters
are replaced with '?' instead of crashing the process.

Fixes #40432
---
 gateway/run.py               |  4 +-
 hermes_logging.py            | 51 ++++++++++++++++++++++++-
 tests/test_hermes_logging.py | 73 ++++++++++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index 4faaa0dabe8..8bf024b9886 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -19706,7 +19706,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
     # Centralized logging — agent.log (INFO+), errors.log (WARNING+),
     # and gateway.log (INFO+, gateway-component records only).
     # Idempotent, so repeated calls from AIAgent.__init__ won't duplicate.
-    from hermes_logging import setup_logging
+    from hermes_logging import setup_logging, _safe_stderr
     setup_logging(hermes_home=_hermes_home, mode="gateway")
 
     # Optional stderr handler — level driven by -v/-q flags on the CLI.
@@ -19718,7 +19718,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
         from agent.redact import RedactingFormatter
 
         _stderr_level = {0: logging.WARNING, 1: logging.INFO}.get(verbosity, logging.DEBUG)
-        _stderr_handler = logging.StreamHandler()
+        _stderr_handler = logging.StreamHandler(_safe_stderr())
         _stderr_handler.setLevel(_stderr_level)
         _stderr_handler.setFormatter(RedactingFormatter('%(levelname)s %(name)s: %(message)s'))
         logging.getLogger().addHandler(_stderr_handler)
diff --git a/hermes_logging.py b/hermes_logging.py
index a3656c8c139..eee46af37f9 100644
--- a/hermes_logging.py
+++ b/hermes_logging.py
@@ -27,8 +27,10 @@ Session context:
     that thread will include ``[session_id]`` for filtering/correlation.
 """
 
+import io
 import logging
 import os
+import sys
 import threading
 from logging.handlers import RotatingFileHandler
 from pathlib import Path
@@ -50,6 +52,53 @@ _session_context = threading.local()
 _LOG_FORMAT = "%(asctime)s %(levelname)s%(session_tag)s %(name)s: %(message)s"
 _LOG_FORMAT_VERBOSE = "%(asctime)s - %(name)s - %(levelname)s%(session_tag)s - %(message)s"
 
+
+def _safe_stderr():
+    """Return a stderr stream that cannot raise ``UnicodeEncodeError``.
+
+    A legacy console or pipe encoding (cp949, cp1252, …) combined with a
+    strict error handler makes ``StreamHandler`` fail on characters such
+    as the em-dash (U+2014).  In that case ``sys.stderr.buffer`` is
+    re-wrapped in a ``TextIOWrapper`` that keeps the stream's own
+    encoding — so everything the console can display still renders
+    correctly — but uses ``errors='replace'``, which turns un-encodable
+    characters into ``?`` instead of raising.
+
+    ``sys.stderr`` itself is returned when it is already UTF-8, when its
+    error handler is already tolerant (CPython's default stderr uses
+    ``backslashreplace``), or when it has no ``encoding``/``buffer`` to
+    re-wrap (``None``, ``StringIO``, custom objects).
+    """
+    stream = sys.stderr
+    encoding = getattr(stream, "encoding", None)
+    raw = getattr(stream, "buffer", None)
+    if not encoding or raw is None:
+        # Not a text layer over a byte buffer: nothing to re-wrap.
+        return stream
+    if encoding.lower().replace("-", "").replace("_", "") == "utf8":
+        return stream
+    if getattr(stream, "errors", None) not in (None, "strict"):
+        # Already lossy-tolerant; re-wrapping would only discard detail.
+        return stream
+    try:
+        # Drain text still held by the original text layer so it cannot
+        # appear after lines written through the new wrapper.
+        stream.flush()
+        wrapped = io.TextIOWrapper(
+            raw,
+            encoding=encoding,
+            errors="replace",
+            line_buffering=True,
+        )
+    except Exception:
+        # Best-effort: a stream we cannot wrap still beats no handler.
+        return stream
+    # sys.stderr still owns ``raw``; closing or garbage-collecting the
+    # wrapper must never close the process-wide stderr buffer.
+    wrapped.close = lambda: None  # type: ignore[assignment]
+    return wrapped
+
+
 # Third-party loggers that are noisy at DEBUG/INFO level.
 _NOISY_LOGGERS = (
     "openai",
@@ -298,7 +347,7 @@ def setup_verbose_logging() -> None:
         if getattr(h, "_hermes_verbose", False):
             return
 
-    handler = logging.StreamHandler()
+    handler = logging.StreamHandler(_safe_stderr())
     handler.setLevel(logging.DEBUG)
     handler.setFormatter(RedactingFormatter(_LOG_FORMAT_VERBOSE, datefmt="%H:%M:%S"))
     handler._hermes_verbose = True  # type: ignore[attr-defined]
diff --git a/tests/test_hermes_logging.py b/tests/test_hermes_logging.py
index febef0a4789..38672da54f5 100644
--- a/tests/test_hermes_logging.py
+++ b/tests/test_hermes_logging.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import stat
+import sys
 import threading
 from logging.handlers import RotatingFileHandler
 from pathlib import Path
@@ -997,3 +998,75 @@ class TestExternalRotationRecovery:
         assert gw_path.exists(), "gateway.log was never recreated"
         assert "AFTER rotation" in gw_path.read_text()
         assert "AFTER rotation" not in rotated.read_text()
+
+
+class TestSafeStderr:
+    """Tests for _safe_stderr() — Unicode tolerance on legacy-encoded stderr."""
+
+    @staticmethod
+    def _fake_stderr(encoding, errors=None):
+        """Build a minimal text-over-bytes stand-in for sys.stderr."""
+        import io
+
+        class FakeStderr:
+            def __init__(self):
+                self.encoding = encoding
+                self.buffer = io.BytesIO()
+                if errors is not None:
+                    self.errors = errors
+
+            def write(self, s):
+                pass
+
+            def flush(self):
+                pass
+
+        return FakeStderr()
+
+    def test_returns_stream_without_encoding(self, monkeypatch):
+        """A stream with no encoding/buffer (StringIO) is returned as-is."""
+        import io
+
+        fake_stderr = io.StringIO()
+        monkeypatch.setattr(sys, "stderr", fake_stderr)
+        assert hermes_logging._safe_stderr() is fake_stderr
+
+    def test_returns_utf8_stderr_unchanged(self, monkeypatch):
+        """A UTF-8 stderr needs no wrapper, however the codec is spelled."""
+        fake = self._fake_stderr("UTF-8", errors="strict")
+        monkeypatch.setattr(sys, "stderr", fake)
+        assert hermes_logging._safe_stderr() is fake
+
+    def test_returns_tolerant_stderr_unchanged(self, monkeypatch):
+        """A stderr whose error handler cannot raise is left alone."""
+        fake = self._fake_stderr("cp1252", errors="backslashreplace")
+        monkeypatch.setattr(sys, "stderr", fake)
+        assert hermes_logging._safe_stderr() is fake
+
+    def test_wraps_strict_non_utf8_stderr(self, monkeypatch):
+        """A strict legacy-encoded stderr is re-wrapped in its own encoding."""
+        import io
+
+        fake = self._fake_stderr("cp949")
+        monkeypatch.setattr(sys, "stderr", fake)
+        result = hermes_logging._safe_stderr()
+        assert isinstance(result, io.TextIOWrapper)
+        assert result.encoding == "cp949"
+        assert result.errors == "replace"
+
+    def test_handler_emits_unicode_without_crash(self, monkeypatch):
+        """A StreamHandler on _safe_stderr() degrades the em-dash to '?'."""
+        fake = self._fake_stderr("ascii", errors="strict")
+        monkeypatch.setattr(sys, "stderr", fake)
+        handler = logging.StreamHandler(hermes_logging._safe_stderr())
+        handler.setFormatter(logging.Formatter("%(message)s"))
+        logger = logging.getLogger("_test_unicode")
+        logger.addHandler(handler)
+        logger.setLevel(logging.DEBUG)
+        try:
+            # Em-dash U+2014 — the exact character from the bug report
+            logger.info("Session hygiene: 400 messages — auto-compressing")
+        finally:
+            logger.removeHandler(handler)
+        emitted = fake.buffer.getvalue().rstrip()
+        assert emitted == b"Session hygiene: 400 messages ? auto-compressing"