From a7df9f0245a3eff6e42345f39549764d42bb2dc2 Mon Sep 17 00:00:00 2001 From: Dennis Soong <275702+dso2ng@users.noreply.github.com> Date: Mon, 20 Apr 2026 17:09:01 +0800 Subject: [PATCH] fix: tolerate non-utf8 filenames in file discovery --- agent/context_references.py | 9 ++++++-- hermes_cli/commands.py | 17 +++++++++++--- tests/agent/test_context_references.py | 29 ++++++++++++++++++++++++ tests/hermes_cli/test_path_completion.py | 25 ++++++++++++++++++++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/agent/context_references.py b/agent/context_references.py index 50a33a1d7..8d7aed648 100644 --- a/agent/context_references.py +++ b/agent/context_references.py @@ -474,20 +474,25 @@ def _iter_visible_entries(path: Path, cwd: Path, limit: int) -> list[Path]: return output +def _decode_fs_lines(data: bytes) -> list[str]: + """Decode subprocess file listings using filesystem semantics.""" + return [os.fsdecode(line) for line in data.splitlines() if line] + + def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None: try: result = subprocess.run( ["rg", "--files", str(path.relative_to(cwd))], cwd=cwd, capture_output=True, - text=True, + text=False, timeout=10, ) except (FileNotFoundError, OSError, subprocess.TimeoutExpired): return None if result.returncode != 0: return None - files = [Path(line.strip()) for line in result.stdout.splitlines() if line.strip()] + files = [Path(line) for line in _decode_fs_lines(result.stdout)] return files[:limit] diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index f753d6f3a..a7551698c 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -991,11 +991,14 @@ class SlashCommandCompleter(Completer): continue try: proc = subprocess.run( - cmd, capture_output=True, text=True, timeout=2, + cmd, + capture_output=True, + text=False, + timeout=2, cwd=cwd, ) - if proc.returncode == 0 and proc.stdout.strip(): - raw = proc.stdout.strip().split("\n") + raw = _decode_fs_lines(proc.stdout) + if proc.returncode == 0 and raw: # Store relative paths for p in raw[:5000]: rel = os.path.relpath(p, cwd) if os.path.isabs(p) else p @@ -1324,6 +1327,14 @@ class SlashCommandAutoSuggest(AutoSuggest): return None +def _decode_fs_lines(data: bytes) -> list[str]: + """Decode subprocess file listings using filesystem semantics. + + This preserves non-UTF-8 filenames via surrogateescape instead of raising. + """ + return [os.fsdecode(line) for line in data.splitlines() if line] + + def _file_size_label(path: str) -> str: """Return a compact human-readable file size, or '' on error.""" try: diff --git a/tests/agent/test_context_references.py b/tests/agent/test_context_references.py index 02456d064..6740f590a 100644 --- a/tests/agent/test_context_references.py +++ b/tests/agent/test_context_references.py @@ -1,6 +1,8 @@ from __future__ import annotations import asyncio +import os +import shutil import subprocess from pathlib import Path from unittest.mock import patch @@ -207,6 +209,33 @@ def test_binary_and_missing_files_become_warnings(sample_repo: Path): assert "not found" in result.message.lower() +@pytest.mark.skipif(shutil.which("rg") is None, reason="rg is required for folder listing tests") +def test_folder_listing_handles_non_utf8_filenames(tmp_path: Path): + from agent.context_references import preprocess_context_references + + workspace = tmp_path / "repo" + workspace.mkdir() + (workspace / "visible.txt").write_text("hello\n", encoding="utf-8") + + bad_path = os.fsencode(workspace) + b"/bad-\xff.txt" + fd = os.open(bad_path, os.O_WRONLY | os.O_CREAT, 0o644) + try: + os.write(fd, b"x") + finally: + os.close(fd) + + result = preprocess_context_references( + "Check @folder:.", + cwd=workspace, + context_length=100_000, + ) + + assert result.expanded + assert "visible.txt" in result.message + assert "bad-" in result.message + assert not result.blocked + + def test_soft_budget_warns_and_hard_budget_refuses(sample_repo: Path): from agent.context_references import preprocess_context_references diff --git a/tests/hermes_cli/test_path_completion.py b/tests/hermes_cli/test_path_completion.py index b41a36e2e..d7401c59e 100644 --- a/tests/hermes_cli/test_path_completion.py +++ b/tests/hermes_cli/test_path_completion.py @@ -1,6 +1,7 @@ """Tests for file path autocomplete in the CLI completer.""" import os +import shutil from unittest.mock import MagicMock import pytest @@ -163,6 +164,30 @@ class TestIntegration: # /etc/hosts should exist on Linux assert any("host" in n.lower() for n in names) + @pytest.mark.skipif(shutil.which("rg") is None, reason="rg is required for project file cache tests") + def test_bare_at_completion_handles_non_utf8_filenames(self, completer, tmp_path): + good = tmp_path / "good.txt" + good.write_text("ok", encoding="utf-8") + + bad_path = os.fsencode(tmp_path) + b"/bad-\xff.txt" + fd = os.open(bad_path, os.O_WRONLY | os.O_CREAT, 0o644) + try: + os.write(fd, b"x") + finally: + os.close(fd) + + old_cwd = os.getcwd() + os.chdir(tmp_path) + try: + doc = Document("@", cursor_position=1) + event = MagicMock() + completions = list(completer.get_completions(doc, event)) + texts = [completion.text for completion in completions] + assert "@file:good.txt" in texts + assert any(text.startswith("@file:bad-") for text in texts) + finally: + os.chdir(old_cwd) + class TestFileSizeLabel: def test_bytes(self, tmp_path):