fix: tolerate non-utf8 filenames in file discovery

This commit is contained in:
Dennis Soong 2026-04-20 17:09:01 +08:00
parent d990fa52ed
commit a7df9f0245
4 changed files with 75 additions and 5 deletions

View file

@ -474,20 +474,25 @@ def _iter_visible_entries(path: Path, cwd: Path, limit: int) -> list[Path]:
return output
def _decode_fs_lines(data: bytes) -> list[str]:
"""Decode subprocess file listings using filesystem semantics."""
return [os.fsdecode(line) for line in data.splitlines() if line]
def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
    """List files under *path* by running ``rg --files`` from *cwd*.

    The subprocess output is captured as raw bytes and decoded line by line
    with ``_decode_fs_lines`` so that a filename which is not valid UTF-8
    does not abort the whole listing with a decode error.

    Args:
        path: Directory (or file) to list; must be located under *cwd*.
        cwd: Working directory for rg; *path* is passed relative to it.
        limit: Maximum number of entries to return.

    Returns:
        Up to *limit* paths exactly as printed by rg, or ``None`` when rg
        cannot be started, exceeds the 10 second timeout, or exits non-zero.

    Raises:
        ValueError: If *path* is not under *cwd* (from ``Path.relative_to``).
    """
    try:
        result = subprocess.run(
            ["rg", "--files", str(path.relative_to(cwd))],
            cwd=cwd,
            capture_output=True,
            # Keep stdout as bytes: text mode would decode it strictly and
            # fail on filenames that are not valid in the locale encoding.
            text=False,
            timeout=10,
        )
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return None
    if result.returncode != 0:
        return None
    files = [Path(line) for line in _decode_fs_lines(result.stdout)]
    return files[:limit]

View file

@ -991,11 +991,14 @@ class SlashCommandCompleter(Completer):
continue
try:
proc = subprocess.run(
cmd, capture_output=True, text=True, timeout=2,
cmd,
capture_output=True,
text=False,
timeout=2,
cwd=cwd,
)
if proc.returncode == 0 and proc.stdout.strip():
raw = proc.stdout.strip().split("\n")
raw = _decode_fs_lines(proc.stdout)
if proc.returncode == 0 and raw:
# Store relative paths
for p in raw[:5000]:
rel = os.path.relpath(p, cwd) if os.path.isabs(p) else p
@ -1324,6 +1327,14 @@ class SlashCommandAutoSuggest(AutoSuggest):
return None
def _decode_fs_lines(data: bytes) -> list[str]:
"""Decode subprocess file listings using filesystem semantics.
This preserves non-UTF-8 filenames via surrogateescape instead of raising.
"""
return [os.fsdecode(line) for line in data.splitlines() if line]
def _file_size_label(path: str) -> str:
"""Return a compact human-readable file size, or '' on error."""
try:

View file

@ -1,6 +1,8 @@
from __future__ import annotations
import asyncio
import os
import shutil
import subprocess
from pathlib import Path
from unittest.mock import patch
@ -207,6 +209,33 @@ def test_binary_and_missing_files_become_warnings(sample_repo: Path):
assert "not found" in result.message.lower()
@pytest.mark.skipif(shutil.which("rg") is None, reason="rg is required for folder listing tests")
def test_folder_listing_handles_non_utf8_filenames(tmp_path: Path):
    """An ``@folder:`` reference must expand when a filename is not valid UTF-8.

    Creates one ordinary file and one whose name contains the byte ``0xff``,
    then checks that both show up in the expanded message and that the
    reference is not blocked. Skips when the filesystem cannot store such a
    name.
    """
    from agent.context_references import preprocess_context_references

    workspace = tmp_path / "repo"
    workspace.mkdir()
    (workspace / "visible.txt").write_text("hello\n", encoding="utf-8")

    # Build the path as bytes: 0xff can never appear in valid UTF-8, so the
    # name cannot be expressed through a normal str path.
    bad_path = os.fsencode(workspace) + b"/bad-\xff.txt"
    try:
        fd = os.open(bad_path, os.O_WRONLY | os.O_CREAT, 0o644)
    except (OSError, UnicodeError) as exc:
        # visible.txt was just written to the same directory, so a failure
        # here means the platform/filesystem rejects the name itself; that
        # is an environment limitation, not a bug in the code under test.
        pytest.skip(f"filesystem cannot store non-UTF-8 filenames: {exc}")
    try:
        os.write(fd, b"x")
    finally:
        os.close(fd)

    result = preprocess_context_references(
        "Check @folder:.",
        cwd=workspace,
        context_length=100_000,
    )

    assert result.expanded
    assert "visible.txt" in result.message
    assert "bad-" in result.message
    assert not result.blocked
def test_soft_budget_warns_and_hard_budget_refuses(sample_repo: Path):
from agent.context_references import preprocess_context_references

View file

@ -1,6 +1,7 @@
"""Tests for file path autocomplete in the CLI completer."""
import os
import shutil
from unittest.mock import MagicMock
import pytest
@ -163,6 +164,30 @@ class TestIntegration:
# /etc/hosts should exist on Linux
assert any("host" in n.lower() for n in names)
@pytest.mark.skipif(shutil.which("rg") is None, reason="rg is required for project file cache tests")
def test_bare_at_completion_handles_non_utf8_filenames(self, completer, tmp_path):
    """Bare ``@`` completion must still offer files when one name is not UTF-8.

    Skips when the filesystem cannot store a filename containing ``0xff``.
    """
    good = tmp_path / "good.txt"
    good.write_text("ok", encoding="utf-8")

    # Bytes path: 0xff is never valid UTF-8, so this name has no clean str
    # form and exercises the bytes-decoding path of the file cache.
    bad_path = os.fsencode(tmp_path) + b"/bad-\xff.txt"
    try:
        fd = os.open(bad_path, os.O_WRONLY | os.O_CREAT, 0o644)
    except (OSError, UnicodeError) as exc:
        # good.txt was created in the same directory a moment ago, so this
        # failure is the filesystem refusing the name, not a product bug.
        pytest.skip(f"filesystem cannot store non-UTF-8 filenames: {exc}")
    try:
        os.write(fd, b"x")
    finally:
        os.close(fd)

    # The completer is exercised from inside tmp_path; always restore the
    # previous working directory so later tests are unaffected.
    old_cwd = os.getcwd()
    os.chdir(tmp_path)
    try:
        doc = Document("@", cursor_position=1)
        event = MagicMock()
        completions = list(completer.get_completions(doc, event))
        texts = [completion.text for completion in completions]
        assert "@file:good.txt" in texts
        assert any(text.startswith("@file:bad-") for text in texts)
    finally:
        os.chdir(old_cwd)
class TestFileSizeLabel:
def test_bytes(self, tmp_path):