feat(file_tools): harden read_file with size guard, dedup, and device blocking (#4315)

* feat(file_tools): harden read_file with size guard, dedup, and device blocking

Three improvements to read_file_tool to reduce wasted context tokens and
prevent process hangs:

1. Character-count guard: reads that produce more than 100K characters
   (≈25-35K tokens across tokenisers) are rejected with an error that
   tells the model to use offset+limit for a smaller range.  The
   effective cap is min(file_size, 100K) so small files that happen to
   have long lines aren't over-penalised.  Large truncated files also
   get a hint nudging toward targeted reads.

2. File-read deduplication: when the same (path, offset, limit) is read
   a second time and the file hasn't been modified (mtime unchanged),
   return a lightweight stub instead of re-sending the full content.
   Writes and patches naturally change mtime, so post-edit reads always
   return fresh content.  The dedup cache is cleared on context
   compression — after compression the original read content is
   summarised away, so the model needs the full content again.

3. Device path blocking: paths like /dev/zero, /dev/random, /dev/stdin
   etc. are rejected before any I/O to prevent process hangs from
   infinite-output or blocking-input devices.

Tests: 17 new tests covering all three features plus the dedup-reset-
on-compression integration.  All 52 file-read tests pass (35 existing +
17 new).  Full tool suite (2124 tests) passes with 0 failures.

* feat: make file_read_max_chars configurable, add docs

Add file_read_max_chars to DEFAULT_CONFIG (default 100K).  read_file_tool
reads this on first call and caches for the process lifetime.  Users on
large-context models can raise it; users on small local models can lower it.

Also adds a 'File Read Safety' section to the configuration docs
explaining the char limit, dedup behavior, and example values.
This commit is contained in:
Teknium 2026-03-31 12:53:19 -07:00 committed by GitHub
parent d3f1987a05
commit e3f8347be3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 605 additions and 10 deletions

View file

@ -15,6 +15,80 @@ logger = logging.getLogger(__name__)
_EXPECTED_WRITE_ERRNOS = {errno.EACCES, errno.EPERM, errno.EROFS}
# ---------------------------------------------------------------------------
# Read-size guard: cap the character count returned to the model.
# We're model-agnostic so we can't count tokens; characters are a safe proxy.
# 100K chars ≈ 25–35K tokens across typical tokenisers. Files larger than
# this in a single read are a context-window hazard — the model should use
# offset+limit to read the relevant section.
#
# Configurable via config.yaml: file_read_max_chars: 200000
# ---------------------------------------------------------------------------
_DEFAULT_MAX_READ_CHARS = 100_000
_max_read_chars_cached: int | None = None
def _get_max_read_chars() -> int:
"""Return the configured max characters per file read.
Reads ``file_read_max_chars`` from config.yaml on first call, caches
the result for the lifetime of the process. Falls back to the
built-in default if the config is missing or invalid.
"""
global _max_read_chars_cached
if _max_read_chars_cached is not None:
return _max_read_chars_cached
try:
from hermes_cli.config import load_config
cfg = load_config()
val = cfg.get("file_read_max_chars")
if isinstance(val, (int, float)) and val > 0:
_max_read_chars_cached = int(val)
return _max_read_chars_cached
except Exception:
pass
_max_read_chars_cached = _DEFAULT_MAX_READ_CHARS
return _max_read_chars_cached
# If the total file size exceeds this AND the caller didn't specify a narrow
# range (limit <= 200), we include a hint encouraging targeted reads.
_LARGE_FILE_HINT_BYTES = 512_000 # 512 KB
# ---------------------------------------------------------------------------
# Device path blocklist — reading these hangs the process (infinite output
# or blocking on input). Checked by path only (no I/O).
# ---------------------------------------------------------------------------
_BLOCKED_DEVICE_PATHS = frozenset({
# Infinite output — never reach EOF
"/dev/zero", "/dev/random", "/dev/urandom", "/dev/full",
# Blocks waiting for input
"/dev/stdin", "/dev/tty", "/dev/console",
# Nonsensical to read
"/dev/stdout", "/dev/stderr",
# fd aliases
"/dev/fd/0", "/dev/fd/1", "/dev/fd/2",
})
def _is_blocked_device(filepath: str) -> bool:
"""Return True if the path would hang the process (infinite output or blocking input).
Uses the *literal* path no symlink resolution because the model
specifies paths directly and realpath follows symlinks all the way
through (e.g. /dev/stdin /proc/self/fd/0 /dev/pts/0), defeating
the check.
"""
normalized = os.path.expanduser(filepath)
if normalized in _BLOCKED_DEVICE_PATHS:
return True
# /proc/self/fd/0-2 and /proc/<pid>/fd/0-2 are Linux aliases for stdio
if normalized.startswith("/proc/") and normalized.endswith(
("/fd/0", "/fd/1", "/fd/2")
):
return True
return False
# Paths that file tools should refuse to write to without going through the
# terminal tool's approval system. These match prefixes after os.path.realpath.
_SENSITIVE_PATH_PREFIXES = ("/etc/", "/boot/", "/usr/lib/systemd/")
@ -53,11 +127,15 @@ def _is_expected_write_exception(exc: Exception) -> bool:
_file_ops_lock = threading.Lock()
_file_ops_cache: dict = {}
# Track files read per task to detect re-read loops after context compression.
# Track files read per task to detect re-read loops and deduplicate reads.
# Per task_id we store:
# "last_key": the key of the most recent read/search call (or None)
# "consecutive": how many times that exact call has been repeated in a row
# "read_history": set of (path, offset, limit) tuples for get_read_files_summary
# "dedup": dict mapping (resolved_path, offset, limit) → mtime float
# Used to skip re-reads of unchanged files. Reset on
# context compression (the original content is summarised
# away so the model needs the full content again).
_read_tracker_lock = threading.Lock()
_read_tracker: dict = {}
@ -195,8 +273,19 @@ def clear_file_ops_cache(task_id: str = None):
def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = "default") -> str:
"""Read a file with pagination and line numbers."""
try:
# Security: block direct reads of internal Hermes cache/index files
# to prevent prompt injection via catalog or hub metadata files.
# ── Device path guard ─────────────────────────────────────────
# Block paths that would hang the process (infinite output,
# blocking on input). Pure path check — no I/O.
if _is_blocked_device(path):
return json.dumps({
"error": (
f"Cannot read '{path}': this is a device file that would "
"block or produce infinite output."
),
})
# ── Hermes internal path guard ────────────────────────────────
# Prevent prompt injection via catalog or hub metadata files.
import pathlib as _pathlib
from hermes_constants import get_hermes_home as _get_hh
_resolved = _pathlib.Path(path).expanduser().resolve()
@ -217,20 +306,83 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
})
except ValueError:
pass
# ── Dedup check ───────────────────────────────────────────────
# If we already read this exact (path, offset, limit) and the
# file hasn't been modified since, return a lightweight stub
# instead of re-sending the same content. Saves context tokens.
resolved_str = str(_resolved)
dedup_key = (resolved_str, offset, limit)
with _read_tracker_lock:
task_data = _read_tracker.setdefault(task_id, {
"last_key": None, "consecutive": 0,
"read_history": set(), "dedup": {},
})
cached_mtime = task_data.get("dedup", {}).get(dedup_key)
if cached_mtime is not None:
try:
current_mtime = os.path.getmtime(resolved_str)
if current_mtime == cached_mtime:
return json.dumps({
"content": (
"File unchanged since last read. The content from "
"the earlier read_file result in this conversation is "
"still current — refer to that instead of re-reading."
),
"path": path,
"dedup": True,
}, ensure_ascii=False)
except OSError:
pass # stat failed — fall through to full read
# ── Perform the read ──────────────────────────────────────────
file_ops = _get_file_ops(task_id)
result = file_ops.read_file(path, offset, limit)
if result.content:
result.content = redact_sensitive_text(result.content)
result_dict = result.to_dict()
# Track reads to detect *consecutive* re-read loops.
# The counter resets whenever any other tool is called in between,
# so only truly back-to-back identical reads trigger warnings/blocks.
# ── Character-count guard ─────────────────────────────────────
# We're model-agnostic so we can't count tokens; characters are
# the best proxy we have. If the read produced an unreasonable
# amount of content, reject it and tell the model to narrow down.
# Note: we check the formatted content (with line-number prefixes),
# not the raw file size, because that's what actually enters context.
content_len = len(result.content or "")
file_size = result_dict.get("file_size", 0)
max_chars = _get_max_read_chars()
if content_len > max_chars:
total_lines = result_dict.get("total_lines", "unknown")
return json.dumps({
"error": (
f"Read produced {content_len:,} characters which exceeds "
f"the safety limit ({max_chars:,} chars). "
"Use offset and limit to read a smaller range. "
f"The file has {total_lines} lines total."
),
"path": path,
"total_lines": total_lines,
"file_size": file_size,
}, ensure_ascii=False)
# Large-file hint: if the file is big and the caller didn't ask
# for a narrow window, nudge toward targeted reads.
if (file_size and file_size > _LARGE_FILE_HINT_BYTES
and limit > 200
and result_dict.get("truncated")):
result_dict.setdefault("_hint", (
f"This file is large ({file_size:,} bytes). "
"Consider reading only the section you need with offset and limit "
"to keep context usage efficient."
))
# ── Track for consecutive-loop detection ──────────────────────
read_key = ("read", path, offset, limit)
with _read_tracker_lock:
task_data = _read_tracker.setdefault(task_id, {
"last_key": None, "consecutive": 0, "read_history": set(),
})
# Ensure "dedup" key exists (backward compat with old tracker state)
if "dedup" not in task_data:
task_data["dedup"] = {}
task_data["read_history"].add((path, offset, limit))
if task_data["last_key"] == read_key:
task_data["consecutive"] += 1
@ -239,6 +391,15 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
task_data["consecutive"] = 1
count = task_data["consecutive"]
# Store dedup entry (mtime at read time).
# Writes/patches will naturally change mtime, so subsequent
# dedup checks after edits will see a different mtime and
# return the full content — no special handling needed.
try:
task_data["dedup"][dedup_key] = os.path.getmtime(resolved_str)
except OSError:
pass # Can't stat — skip dedup for this entry
if count >= 4:
# Hard block: stop returning content to break the loop
return json.dumps({
@ -296,6 +457,28 @@ def clear_read_tracker(task_id: str = None):
_read_tracker.clear()
def reset_file_dedup(task_id: str = None):
    """Clear the deduplication cache for file reads.

    Called after context compression — the original read content has been
    summarised away, so the model needs the full content if it reads the
    same file again.  Without this, reads after compression would return
    a "file unchanged" stub pointing at content that no longer exists in
    context.

    Call with a task_id to clear just that task, or without to clear all.
    """
    with _read_tracker_lock:
        # One task's tracker entry, or every task's when no id was given.
        if task_id:
            targets = [_read_tracker.get(task_id)]
        else:
            targets = list(_read_tracker.values())
        for entry in targets:
            if entry and "dedup" in entry:
                entry["dedup"].clear()
def notify_other_tool_call(task_id: str = "default"):
"""Reset consecutive read/search counter for a task.
@ -466,7 +649,7 @@ def _check_file_reqs():
READ_FILE_SCHEMA = {
"name": "read_file",
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. Reads exceeding ~100K characters are rejected; use offset and limit to read specific sections of large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
"parameters": {
"type": "object",
"properties": {