mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(file_tools): harden read_file with size guard, dedup, and device blocking (#4315)
* feat(file_tools): harden read_file with size guard, dedup, and device blocking Three improvements to read_file_tool to reduce wasted context tokens and prevent process hangs: 1. Character-count guard: reads that produce more than 100K characters (≈25-35K tokens across tokenisers) are rejected with an error that tells the model to use offset+limit for a smaller range. The effective cap is min(file_size, 100K) so small files that happen to have long lines aren't over-penalised. Large truncated files also get a hint nudging toward targeted reads. 2. File-read deduplication: when the same (path, offset, limit) is read a second time and the file hasn't been modified (mtime unchanged), return a lightweight stub instead of re-sending the full content. Writes and patches naturally change mtime, so post-edit reads always return fresh content. The dedup cache is cleared on context compression — after compression the original read content is summarised away, so the model needs the full content again. 3. Device path blocking: paths like /dev/zero, /dev/random, /dev/stdin etc. are rejected before any I/O to prevent process hangs from infinite-output or blocking-input devices. Tests: 17 new tests covering all three features plus the dedup-reset- on-compression integration. All 52 file-read tests pass (35 existing + 17 new). Full tool suite (2124 tests) passes with 0 failures. * feat: make file_read_max_chars configurable, add docs Add file_read_max_chars to DEFAULT_CONFIG (default 100K). read_file_tool reads this on first call and caches for the process lifetime. Users on large-context models can raise it; users on small local models can lower it. Also adds a 'File Read Safety' section to the configuration docs explaining the char limit, dedup behavior, and example values.
This commit is contained in:
parent
d3f1987a05
commit
e3f8347be3
5 changed files with 605 additions and 10 deletions
|
|
@ -15,6 +15,80 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
# Errno values treated as expected outcomes of a blocked write attempt
# (permission denied / read-only filesystem) rather than unexpected failures.
# Presumably consumed by _is_expected_write_exception below — confirm usage.
_EXPECTED_WRITE_ERRNOS = {errno.EACCES, errno.EPERM, errno.EROFS}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Read-size guard: cap the character count returned to the model.
|
||||
# We're model-agnostic so we can't count tokens; characters are a safe proxy.
|
||||
# 100K chars ≈ 25–35K tokens across typical tokenisers. Files larger than
|
||||
# this in a single read are a context-window hazard — the model should use
|
||||
# offset+limit to read the relevant section.
|
||||
#
|
||||
# Configurable via config.yaml: file_read_max_chars: 200000
|
||||
# ---------------------------------------------------------------------------
|
||||
_DEFAULT_MAX_READ_CHARS = 100_000
|
||||
_max_read_chars_cached: int | None = None
|
||||
|
||||
|
||||
def _get_max_read_chars() -> int:
|
||||
"""Return the configured max characters per file read.
|
||||
|
||||
Reads ``file_read_max_chars`` from config.yaml on first call, caches
|
||||
the result for the lifetime of the process. Falls back to the
|
||||
built-in default if the config is missing or invalid.
|
||||
"""
|
||||
global _max_read_chars_cached
|
||||
if _max_read_chars_cached is not None:
|
||||
return _max_read_chars_cached
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
val = cfg.get("file_read_max_chars")
|
||||
if isinstance(val, (int, float)) and val > 0:
|
||||
_max_read_chars_cached = int(val)
|
||||
return _max_read_chars_cached
|
||||
except Exception:
|
||||
pass
|
||||
_max_read_chars_cached = _DEFAULT_MAX_READ_CHARS
|
||||
return _max_read_chars_cached
|
||||
|
||||
# If the total file size exceeds this AND the caller didn't specify a narrow
|
||||
# range (limit <= 200), we include a hint encouraging targeted reads.
|
||||
_LARGE_FILE_HINT_BYTES = 512_000 # 512 KB
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Device path blocklist — reading these hangs the process (infinite output
|
||||
# or blocking on input). Checked by path only (no I/O).
|
||||
# ---------------------------------------------------------------------------
|
||||
_BLOCKED_DEVICE_PATHS = frozenset({
|
||||
# Infinite output — never reach EOF
|
||||
"/dev/zero", "/dev/random", "/dev/urandom", "/dev/full",
|
||||
# Blocks waiting for input
|
||||
"/dev/stdin", "/dev/tty", "/dev/console",
|
||||
# Nonsensical to read
|
||||
"/dev/stdout", "/dev/stderr",
|
||||
# fd aliases
|
||||
"/dev/fd/0", "/dev/fd/1", "/dev/fd/2",
|
||||
})
|
||||
|
||||
|
||||
def _is_blocked_device(filepath: str) -> bool:
|
||||
"""Return True if the path would hang the process (infinite output or blocking input).
|
||||
|
||||
Uses the *literal* path — no symlink resolution — because the model
|
||||
specifies paths directly and realpath follows symlinks all the way
|
||||
through (e.g. /dev/stdin → /proc/self/fd/0 → /dev/pts/0), defeating
|
||||
the check.
|
||||
"""
|
||||
normalized = os.path.expanduser(filepath)
|
||||
if normalized in _BLOCKED_DEVICE_PATHS:
|
||||
return True
|
||||
# /proc/self/fd/0-2 and /proc/<pid>/fd/0-2 are Linux aliases for stdio
|
||||
if normalized.startswith("/proc/") and normalized.endswith(
|
||||
("/fd/0", "/fd/1", "/fd/2")
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# Paths that file tools should refuse to write to without going through the
# terminal tool's approval system. These match prefixes after os.path.realpath,
# so symlinks into these trees are caught too.
_SENSITIVE_PATH_PREFIXES = ("/etc/", "/boot/", "/usr/lib/systemd/")
|
||||
|
|
@ -53,11 +127,15 @@ def _is_expected_write_exception(exc: Exception) -> bool:
|
|||
# Guards _file_ops_cache; per-task FileOps instances are cached here.
_file_ops_lock = threading.Lock()
_file_ops_cache: dict = {}

# Track files read per task to detect re-read loops and deduplicate reads.
# Per task_id we store:
#   "last_key":     the key of the most recent read/search call (or None)
#   "consecutive":  how many times that exact call has been repeated in a row
#   "read_history": set of (path, offset, limit) tuples for get_read_files_summary
#   "dedup":        dict mapping (resolved_path, offset, limit) → mtime float
#                   Used to skip re-reads of unchanged files. Reset on
#                   context compression (the original content is summarised
#                   away so the model needs the full content again).
_read_tracker_lock = threading.Lock()
_read_tracker: dict = {}
|
||||
|
||||
|
|
@ -195,8 +273,19 @@ def clear_file_ops_cache(task_id: str = None):
|
|||
def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = "default") -> str:
|
||||
"""Read a file with pagination and line numbers."""
|
||||
try:
|
||||
# Security: block direct reads of internal Hermes cache/index files
|
||||
# to prevent prompt injection via catalog or hub metadata files.
|
||||
# ── Device path guard ─────────────────────────────────────────
|
||||
# Block paths that would hang the process (infinite output,
|
||||
# blocking on input). Pure path check — no I/O.
|
||||
if _is_blocked_device(path):
|
||||
return json.dumps({
|
||||
"error": (
|
||||
f"Cannot read '{path}': this is a device file that would "
|
||||
"block or produce infinite output."
|
||||
),
|
||||
})
|
||||
|
||||
# ── Hermes internal path guard ────────────────────────────────
|
||||
# Prevent prompt injection via catalog or hub metadata files.
|
||||
import pathlib as _pathlib
|
||||
from hermes_constants import get_hermes_home as _get_hh
|
||||
_resolved = _pathlib.Path(path).expanduser().resolve()
|
||||
|
|
@ -217,20 +306,83 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
|
|||
})
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# ── Dedup check ───────────────────────────────────────────────
|
||||
# If we already read this exact (path, offset, limit) and the
|
||||
# file hasn't been modified since, return a lightweight stub
|
||||
# instead of re-sending the same content. Saves context tokens.
|
||||
resolved_str = str(_resolved)
|
||||
dedup_key = (resolved_str, offset, limit)
|
||||
with _read_tracker_lock:
|
||||
task_data = _read_tracker.setdefault(task_id, {
|
||||
"last_key": None, "consecutive": 0,
|
||||
"read_history": set(), "dedup": {},
|
||||
})
|
||||
cached_mtime = task_data.get("dedup", {}).get(dedup_key)
|
||||
|
||||
if cached_mtime is not None:
|
||||
try:
|
||||
current_mtime = os.path.getmtime(resolved_str)
|
||||
if current_mtime == cached_mtime:
|
||||
return json.dumps({
|
||||
"content": (
|
||||
"File unchanged since last read. The content from "
|
||||
"the earlier read_file result in this conversation is "
|
||||
"still current — refer to that instead of re-reading."
|
||||
),
|
||||
"path": path,
|
||||
"dedup": True,
|
||||
}, ensure_ascii=False)
|
||||
except OSError:
|
||||
pass # stat failed — fall through to full read
|
||||
|
||||
# ── Perform the read ──────────────────────────────────────────
|
||||
file_ops = _get_file_ops(task_id)
|
||||
result = file_ops.read_file(path, offset, limit)
|
||||
if result.content:
|
||||
result.content = redact_sensitive_text(result.content)
|
||||
result_dict = result.to_dict()
|
||||
|
||||
# Track reads to detect *consecutive* re-read loops.
|
||||
# The counter resets whenever any other tool is called in between,
|
||||
# so only truly back-to-back identical reads trigger warnings/blocks.
|
||||
# ── Character-count guard ─────────────────────────────────────
|
||||
# We're model-agnostic so we can't count tokens; characters are
|
||||
# the best proxy we have. If the read produced an unreasonable
|
||||
# amount of content, reject it and tell the model to narrow down.
|
||||
# Note: we check the formatted content (with line-number prefixes),
|
||||
# not the raw file size, because that's what actually enters context.
|
||||
content_len = len(result.content or "")
|
||||
file_size = result_dict.get("file_size", 0)
|
||||
max_chars = _get_max_read_chars()
|
||||
if content_len > max_chars:
|
||||
total_lines = result_dict.get("total_lines", "unknown")
|
||||
return json.dumps({
|
||||
"error": (
|
||||
f"Read produced {content_len:,} characters which exceeds "
|
||||
f"the safety limit ({max_chars:,} chars). "
|
||||
"Use offset and limit to read a smaller range. "
|
||||
f"The file has {total_lines} lines total."
|
||||
),
|
||||
"path": path,
|
||||
"total_lines": total_lines,
|
||||
"file_size": file_size,
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Large-file hint: if the file is big and the caller didn't ask
|
||||
# for a narrow window, nudge toward targeted reads.
|
||||
if (file_size and file_size > _LARGE_FILE_HINT_BYTES
|
||||
and limit > 200
|
||||
and result_dict.get("truncated")):
|
||||
result_dict.setdefault("_hint", (
|
||||
f"This file is large ({file_size:,} bytes). "
|
||||
"Consider reading only the section you need with offset and limit "
|
||||
"to keep context usage efficient."
|
||||
))
|
||||
|
||||
# ── Track for consecutive-loop detection ──────────────────────
|
||||
read_key = ("read", path, offset, limit)
|
||||
with _read_tracker_lock:
|
||||
task_data = _read_tracker.setdefault(task_id, {
|
||||
"last_key": None, "consecutive": 0, "read_history": set(),
|
||||
})
|
||||
# Ensure "dedup" key exists (backward compat with old tracker state)
|
||||
if "dedup" not in task_data:
|
||||
task_data["dedup"] = {}
|
||||
task_data["read_history"].add((path, offset, limit))
|
||||
if task_data["last_key"] == read_key:
|
||||
task_data["consecutive"] += 1
|
||||
|
|
@ -239,6 +391,15 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
|
|||
task_data["consecutive"] = 1
|
||||
count = task_data["consecutive"]
|
||||
|
||||
# Store dedup entry (mtime at read time).
|
||||
# Writes/patches will naturally change mtime, so subsequent
|
||||
# dedup checks after edits will see a different mtime and
|
||||
# return the full content — no special handling needed.
|
||||
try:
|
||||
task_data["dedup"][dedup_key] = os.path.getmtime(resolved_str)
|
||||
except OSError:
|
||||
pass # Can't stat — skip dedup for this entry
|
||||
|
||||
if count >= 4:
|
||||
# Hard block: stop returning content to break the loop
|
||||
return json.dumps({
|
||||
|
|
@ -296,6 +457,28 @@ def clear_read_tracker(task_id: str = None):
|
|||
_read_tracker.clear()
|
||||
|
||||
|
||||
def reset_file_dedup(task_id: str = None):
    """Clear the deduplication cache for file reads.

    Called after context compression — the original read content has been
    summarised away, so the model needs the full content if it reads the
    same file again. Without this, reads after compression would return
    a "file unchanged" stub pointing at content that no longer exists in
    context.

    Call with a task_id to clear just that task, or without to clear all.
    """
    with _read_tracker_lock:
        # Select the tracker entries to touch: one task's state when a
        # task_id is given (and present), otherwise every task's state.
        if task_id:
            targets = [_read_tracker[task_id]] if task_id in _read_tracker else []
        else:
            targets = list(_read_tracker.values())
        for entry in targets:
            dedup_map = entry.get("dedup")
            if dedup_map is not None:
                dedup_map.clear()
|
||||
|
||||
|
||||
def notify_other_tool_call(task_id: str = "default"):
|
||||
"""Reset consecutive read/search counter for a task.
|
||||
|
||||
|
|
@ -466,7 +649,7 @@ def _check_file_reqs():
|
|||
|
||||
READ_FILE_SCHEMA = {
|
||||
"name": "read_file",
|
||||
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
|
||||
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. Reads exceeding ~100K characters are rejected; use offset and limit to read specific sections of large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue