mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(environments): use incremental UTF-8 decoder in select-based drain
The first draft of the fix called `chunk.decode("utf-8")` directly on
each 4096-byte `os.read()` result, which corrupts output whenever a
multi-byte UTF-8 character straddles a read boundary:
* `UnicodeDecodeError` fires on the valid-but-truncated byte sequence.
* The except handler clears ALL previously-decoded output and replaces
the whole buffer with `[binary output detected ...]`.
Empirically: 10000 '日' chars (30,001 bytes including the trailing newline) through the wrapper loses
all 10000 characters on the first draft; the baseline TextIOWrapper
drain (which uses `encoding='utf-8', errors='replace'` on Popen)
preserves them all. This regression affects any command emitting
non-ASCII output larger than one chunk — CJK/Arabic/emoji in
`npm install`, `pip install`, `docker logs`, `kubectl logs`, etc.
Fix: swap to `codecs.getincrementaldecoder('utf-8')(errors='replace')`,
which buffers partial multi-byte sequences across chunks and substitutes
U+FFFD for genuinely invalid bytes. Flush on drain exit via
`decoder.decode(b'', final=True)` to emit any trailing replacement
character for a dangling partial sequence.
Adds two regression tests:
* test_utf8_multibyte_across_read_boundary — 10000 U+65E5 chars,
verifies count round-trips and no fallback fires.
* test_invalid_utf8_uses_replacement_not_fallback — deliberate
\xff\xfe between valid ASCII, verifies surrounding text survives.
This commit is contained in:
parent
0a02fbd842
commit
f336ae3d7d
2 changed files with 84 additions and 26 deletions
|
|
@@ -109,3 +109,46 @@ class TestBackgroundChildDoesNotHang:
|
||||||
assert "日本語" in result["output"]
|
assert "日本語" in result["output"]
|
||||||
assert "café" in result["output"]
|
assert "café" in result["output"]
|
||||||
assert "résumé" in result["output"]
|
assert "résumé" in result["output"]
|
||||||
|
|
||||||
|
def test_utf8_multibyte_across_read_boundary(self, local_env):
|
||||||
|
"""Multibyte UTF-8 characters straddling a 4096-byte ``os.read()`` boundary
|
||||||
|
must be decoded correctly via the incremental decoder — not lost to a
|
||||||
|
``UnicodeDecodeError`` fallback. Regression for a bug in the first draft
|
||||||
|
of the fix where a strict ``bytes.decode('utf-8')`` on each raw chunk
|
||||||
|
wiped the entire buffer as soon as any chunk split a multi-byte char.
|
||||||
|
"""
|
||||||
|
# 10000 "日" chars = 30000 bytes — guaranteed to cross multiple 4096
|
||||||
|
# read boundaries, and most boundaries will land in the middle of the
|
||||||
|
# 3-byte UTF-8 encoding of U+65E5.
|
||||||
|
cmd = (
|
||||||
|
'python3 -c \'import sys; '
|
||||||
|
'sys.stdout.buffer.write(chr(0x65e5).encode("utf-8") * 10000); '
|
||||||
|
'sys.stdout.buffer.write(b"\\n")\''
|
||||||
|
)
|
||||||
|
result = local_env.execute(cmd, timeout=10)
|
||||||
|
assert result["returncode"] == 0
|
||||||
|
# All 10000 characters must survive the round-trip
|
||||||
|
assert result["output"].count("\u65e5") == 10000, (
|
||||||
|
f"lost multibyte chars across read boundaries: got "
|
||||||
|
f"{result['output'].count(chr(0x65e5))} / 10000"
|
||||||
|
)
|
||||||
|
# And the "[binary output detected ...]" fallback must NOT fire
|
||||||
|
assert "binary output detected" not in result["output"]
|
||||||
|
|
||||||
|
def test_invalid_utf8_uses_replacement_not_fallback(self, local_env):
|
||||||
|
"""Truly invalid byte sequences must be substituted with U+FFFD (matching
|
||||||
|
the pre-fix ``errors='replace'`` behaviour of the old ``TextIOWrapper``
|
||||||
|
drain), not clobber the entire buffer with a fallback placeholder.
|
||||||
|
"""
|
||||||
|
# Write a deliberate invalid UTF-8 lead byte sandwiched between valid ASCII
|
||||||
|
cmd = (
|
||||||
|
'python3 -c \'import sys; '
|
||||||
|
'sys.stdout.buffer.write(b"before "); '
|
||||||
|
'sys.stdout.buffer.write(b"\\xff\\xfe"); '
|
||||||
|
'sys.stdout.buffer.write(b" after\\n")\''
|
||||||
|
)
|
||||||
|
result = local_env.execute(cmd, timeout=5)
|
||||||
|
assert result["returncode"] == 0
|
||||||
|
assert "before" in result["output"]
|
||||||
|
assert "after" in result["output"]
|
||||||
|
assert "binary output detected" not in result["output"]
|
||||||
|
|
|
||||||
|
|
@@ -6,6 +6,7 @@ re-sourced before each command. CWD persists via in-band stdout markers (remote)
|
||||||
or a temp file (local).
|
or a temp file (local).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import codecs
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
@@ -453,37 +454,51 @@ class BaseEnvironment(ABC):
|
||||||
# shortly after ``bash`` exits even if the pipe hasn't EOF'd yet.
|
# shortly after ``bash`` exits even if the pipe hasn't EOF'd yet.
|
||||||
# Any output the grandchild writes after that point goes to an
|
# Any output the grandchild writes after that point goes to an
|
||||||
# orphaned pipe (harmless — the kernel reaps it when our end closes).
|
# orphaned pipe (harmless — the kernel reaps it when our end closes).
|
||||||
|
#
|
||||||
|
# Decoding: we ``os.read()`` raw bytes in fixed-size chunks (4096)
|
||||||
|
# so a single multibyte UTF-8 character can split across reads. An
|
||||||
|
# incremental decoder buffers partial sequences across chunks, and
|
||||||
|
# ``errors="replace"`` mirrors the baseline ``TextIOWrapper`` (which
|
||||||
|
# was constructed with ``encoding="utf-8", errors="replace"`` on
|
||||||
|
# ``Popen``) so binary or mis-encoded output is preserved with
|
||||||
|
# U+FFFD substitution rather than clobbering the whole buffer.
|
||||||
|
decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
|
||||||
|
|
||||||
def _drain():
|
def _drain():
|
||||||
fd = proc.stdout.fileno()
|
fd = proc.stdout.fileno()
|
||||||
idle_after_exit = 0
|
idle_after_exit = 0
|
||||||
while True:
|
try:
|
||||||
try:
|
while True:
|
||||||
ready, _, _ = select.select([fd], [], [], 0.1)
|
|
||||||
except (ValueError, OSError):
|
|
||||||
break # fd already closed
|
|
||||||
if ready:
|
|
||||||
try:
|
try:
|
||||||
chunk = os.read(fd, 4096)
|
ready, _, _ = select.select([fd], [], [], 0.1)
|
||||||
except (ValueError, OSError):
|
except (ValueError, OSError):
|
||||||
break
|
break # fd already closed
|
||||||
if not chunk:
|
if ready:
|
||||||
break # true EOF — all writers closed
|
try:
|
||||||
try:
|
chunk = os.read(fd, 4096)
|
||||||
output_chunks.append(chunk.decode("utf-8"))
|
except (ValueError, OSError):
|
||||||
except UnicodeDecodeError:
|
break
|
||||||
output_chunks.clear()
|
if not chunk:
|
||||||
output_chunks.append(
|
break # true EOF — all writers closed
|
||||||
"[binary output detected — raw bytes not displayable]"
|
output_chunks.append(decoder.decode(chunk))
|
||||||
)
|
idle_after_exit = 0
|
||||||
break
|
elif proc.poll() is not None:
|
||||||
idle_after_exit = 0
|
# bash is gone and the pipe was idle for ~100ms. Give
|
||||||
elif proc.poll() is not None:
|
# it two more cycles to catch any buffered tail, then
|
||||||
# bash is gone and the pipe was idle for ~100ms. Give
|
# stop — otherwise we wait forever on a grandchild pipe.
|
||||||
# it two more cycles to catch any buffered tail, then
|
idle_after_exit += 1
|
||||||
# stop — otherwise we wait forever on a grandchild pipe.
|
if idle_after_exit >= 3:
|
||||||
idle_after_exit += 1
|
break
|
||||||
if idle_after_exit >= 3:
|
finally:
|
||||||
break
|
# Flush any bytes buffered mid-sequence. With ``errors="replace"``
|
||||||
|
# this emits U+FFFD for any final incomplete sequence rather than
|
||||||
|
# raising.
|
||||||
|
try:
|
||||||
|
tail = decoder.decode(b"", final=True)
|
||||||
|
if tail:
|
||||||
|
output_chunks.append(tail)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
drain_thread = threading.Thread(target=_drain, daemon=True)
|
drain_thread = threading.Thread(target=_drain, daemon=True)
|
||||||
drain_thread.start()
|
drain_thread.start()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue