feat(file_tools): harden read_file with size guard, dedup, and device blocking (#4315)

* feat(file_tools): harden read_file with size guard, dedup, and device blocking

Three improvements to read_file_tool to reduce wasted context tokens and
prevent process hangs:

1. Character-count guard: reads that produce more than 100K characters
   (≈25-35K tokens across tokenisers) are rejected with an error that
   tells the model to use offset+limit for a smaller range.  The
   effective cap is min(file_size, 100K) so small files that happen to
   have long lines aren't over-penalised.  Large truncated files also
   get a hint nudging toward targeted reads.

2. File-read deduplication: when the same (path, offset, limit) is read
   a second time and the file hasn't been modified (mtime unchanged),
   return a lightweight stub instead of re-sending the full content.
   Writes and patches naturally change mtime, so post-edit reads always
   return fresh content.  The dedup cache is cleared on context
   compression — after compression the original read content is
   summarised away, so the model needs the full content again.

3. Device path blocking: paths like /dev/zero, /dev/random, /dev/stdin
   etc. are rejected before any I/O to prevent process hangs from
   infinite-output or blocking-input devices.

Tests: 17 new tests covering all three features plus the dedup-reset-
on-compression integration.  All 52 file-read tests pass (35 existing +
17 new).  Full tool suite (2124 tests) passes with 0 failures.

* feat: make file_read_max_chars configurable, add docs

Add file_read_max_chars to DEFAULT_CONFIG (default 100K).  read_file_tool
reads this on first call and caches for the process lifetime.  Users on
large-context models can raise it; users on small local models can lower it.

Also adds a 'File Read Safety' section to the configuration docs
explaining the char limit, dedup behavior, and example values.
This commit is contained in:
Teknium 2026-03-31 12:53:19 -07:00 committed by GitHub
parent d3f1987a05
commit e3f8347be3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 605 additions and 10 deletions

View file

@ -15,6 +15,80 @@ logger = logging.getLogger(__name__)
_EXPECTED_WRITE_ERRNOS = {errno.EACCES, errno.EPERM, errno.EROFS}
# ---------------------------------------------------------------------------
# Read-size guard: cap the character count returned to the model.
# We're model-agnostic so we can't count tokens; characters are a safe proxy.
# 100K chars ≈ 25–35K tokens across typical tokenisers. Files larger than
# this in a single read are a context-window hazard — the model should use
# offset+limit to read the relevant section.
#
# Configurable via config.yaml: file_read_max_chars: 200000
# ---------------------------------------------------------------------------
_DEFAULT_MAX_READ_CHARS = 100_000
_max_read_chars_cached: int | None = None
def _get_max_read_chars() -> int:
"""Return the configured max characters per file read.
Reads ``file_read_max_chars`` from config.yaml on first call, caches
the result for the lifetime of the process. Falls back to the
built-in default if the config is missing or invalid.
"""
global _max_read_chars_cached
if _max_read_chars_cached is not None:
return _max_read_chars_cached
try:
from hermes_cli.config import load_config
cfg = load_config()
val = cfg.get("file_read_max_chars")
if isinstance(val, (int, float)) and val > 0:
_max_read_chars_cached = int(val)
return _max_read_chars_cached
except Exception:
pass
_max_read_chars_cached = _DEFAULT_MAX_READ_CHARS
return _max_read_chars_cached
# If the total file size exceeds this AND the caller didn't specify a narrow
# range (limit <= 200), we include a hint encouraging targeted reads.
_LARGE_FILE_HINT_BYTES = 512_000 # 512 KB
# ---------------------------------------------------------------------------
# Device path blocklist — reading these hangs the process (infinite output
# or blocking on input). Checked by path only (no I/O).
# ---------------------------------------------------------------------------
_BLOCKED_DEVICE_PATHS = frozenset({
# Infinite output — never reach EOF
"/dev/zero", "/dev/random", "/dev/urandom", "/dev/full",
# Blocks waiting for input
"/dev/stdin", "/dev/tty", "/dev/console",
# Nonsensical to read
"/dev/stdout", "/dev/stderr",
# fd aliases
"/dev/fd/0", "/dev/fd/1", "/dev/fd/2",
})
def _is_blocked_device(filepath: str) -> bool:
"""Return True if the path would hang the process (infinite output or blocking input).
Uses the *literal* path no symlink resolution because the model
specifies paths directly and realpath follows symlinks all the way
through (e.g. /dev/stdin /proc/self/fd/0 /dev/pts/0), defeating
the check.
"""
normalized = os.path.expanduser(filepath)
if normalized in _BLOCKED_DEVICE_PATHS:
return True
# /proc/self/fd/0-2 and /proc/<pid>/fd/0-2 are Linux aliases for stdio
if normalized.startswith("/proc/") and normalized.endswith(
("/fd/0", "/fd/1", "/fd/2")
):
return True
return False
# Paths that file tools should refuse to write to without going through the
# terminal tool's approval system. These match prefixes after os.path.realpath.
_SENSITIVE_PATH_PREFIXES = ("/etc/", "/boot/", "/usr/lib/systemd/")
@ -53,11 +127,15 @@ def _is_expected_write_exception(exc: Exception) -> bool:
_file_ops_lock = threading.Lock()
_file_ops_cache: dict = {}
# Track files read per task to detect re-read loops after context compression.
# Track files read per task to detect re-read loops and deduplicate reads.
# Per task_id we store:
# "last_key": the key of the most recent read/search call (or None)
# "consecutive": how many times that exact call has been repeated in a row
# "read_history": set of (path, offset, limit) tuples for get_read_files_summary
# "dedup": dict mapping (resolved_path, offset, limit) → mtime float
# Used to skip re-reads of unchanged files. Reset on
# context compression (the original content is summarised
# away so the model needs the full content again).
_read_tracker_lock = threading.Lock()
_read_tracker: dict = {}
@ -195,8 +273,19 @@ def clear_file_ops_cache(task_id: str = None):
def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = "default") -> str:
"""Read a file with pagination and line numbers."""
try:
# Security: block direct reads of internal Hermes cache/index files
# to prevent prompt injection via catalog or hub metadata files.
# ── Device path guard ─────────────────────────────────────────
# Block paths that would hang the process (infinite output,
# blocking on input). Pure path check — no I/O.
if _is_blocked_device(path):
return json.dumps({
"error": (
f"Cannot read '{path}': this is a device file that would "
"block or produce infinite output."
),
})
# ── Hermes internal path guard ────────────────────────────────
# Prevent prompt injection via catalog or hub metadata files.
import pathlib as _pathlib
from hermes_constants import get_hermes_home as _get_hh
_resolved = _pathlib.Path(path).expanduser().resolve()
@ -217,20 +306,83 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
})
except ValueError:
pass
# ── Dedup check ───────────────────────────────────────────────
# If we already read this exact (path, offset, limit) and the
# file hasn't been modified since, return a lightweight stub
# instead of re-sending the same content. Saves context tokens.
resolved_str = str(_resolved)
dedup_key = (resolved_str, offset, limit)
with _read_tracker_lock:
task_data = _read_tracker.setdefault(task_id, {
"last_key": None, "consecutive": 0,
"read_history": set(), "dedup": {},
})
cached_mtime = task_data.get("dedup", {}).get(dedup_key)
if cached_mtime is not None:
try:
current_mtime = os.path.getmtime(resolved_str)
if current_mtime == cached_mtime:
return json.dumps({
"content": (
"File unchanged since last read. The content from "
"the earlier read_file result in this conversation is "
"still current — refer to that instead of re-reading."
),
"path": path,
"dedup": True,
}, ensure_ascii=False)
except OSError:
pass # stat failed — fall through to full read
# ── Perform the read ──────────────────────────────────────────
file_ops = _get_file_ops(task_id)
result = file_ops.read_file(path, offset, limit)
if result.content:
result.content = redact_sensitive_text(result.content)
result_dict = result.to_dict()
# Track reads to detect *consecutive* re-read loops.
# The counter resets whenever any other tool is called in between,
# so only truly back-to-back identical reads trigger warnings/blocks.
# ── Character-count guard ─────────────────────────────────────
# We're model-agnostic so we can't count tokens; characters are
# the best proxy we have. If the read produced an unreasonable
# amount of content, reject it and tell the model to narrow down.
# Note: we check the formatted content (with line-number prefixes),
# not the raw file size, because that's what actually enters context.
content_len = len(result.content or "")
file_size = result_dict.get("file_size", 0)
max_chars = _get_max_read_chars()
if content_len > max_chars:
total_lines = result_dict.get("total_lines", "unknown")
return json.dumps({
"error": (
f"Read produced {content_len:,} characters which exceeds "
f"the safety limit ({max_chars:,} chars). "
"Use offset and limit to read a smaller range. "
f"The file has {total_lines} lines total."
),
"path": path,
"total_lines": total_lines,
"file_size": file_size,
}, ensure_ascii=False)
# Large-file hint: if the file is big and the caller didn't ask
# for a narrow window, nudge toward targeted reads.
if (file_size and file_size > _LARGE_FILE_HINT_BYTES
and limit > 200
and result_dict.get("truncated")):
result_dict.setdefault("_hint", (
f"This file is large ({file_size:,} bytes). "
"Consider reading only the section you need with offset and limit "
"to keep context usage efficient."
))
# ── Track for consecutive-loop detection ──────────────────────
read_key = ("read", path, offset, limit)
with _read_tracker_lock:
task_data = _read_tracker.setdefault(task_id, {
"last_key": None, "consecutive": 0, "read_history": set(),
})
# Ensure "dedup" key exists (backward compat with old tracker state)
if "dedup" not in task_data:
task_data["dedup"] = {}
task_data["read_history"].add((path, offset, limit))
if task_data["last_key"] == read_key:
task_data["consecutive"] += 1
@ -239,6 +391,15 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
task_data["consecutive"] = 1
count = task_data["consecutive"]
# Store dedup entry (mtime at read time).
# Writes/patches will naturally change mtime, so subsequent
# dedup checks after edits will see a different mtime and
# return the full content — no special handling needed.
try:
task_data["dedup"][dedup_key] = os.path.getmtime(resolved_str)
except OSError:
pass # Can't stat — skip dedup for this entry
if count >= 4:
# Hard block: stop returning content to break the loop
return json.dumps({
@ -296,6 +457,28 @@ def clear_read_tracker(task_id: str = None):
_read_tracker.clear()
def reset_file_dedup(task_id: str = None):
    """Clear the deduplication cache for file reads.

    Called after context compression — the original read content has been
    summarised away, so the model needs the full content if it reads the
    same file again.  Without this, reads after compression would return
    a "file unchanged" stub pointing at content that no longer exists in
    context.

    Call with a task_id to clear just that task, or without to clear all.
    """
    with _read_tracker_lock:
        # One task's tracker entry, or every task's when no id was given.
        if task_id:
            targets = [_read_tracker.get(task_id)]
        else:
            targets = list(_read_tracker.values())
        for entry in targets:
            if entry and "dedup" in entry:
                entry["dedup"].clear()
def notify_other_tool_call(task_id: str = "default"):
"""Reset consecutive read/search counter for a task.
@ -466,7 +649,7 @@ def _check_file_reqs():
READ_FILE_SCHEMA = {
"name": "read_file",
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. Reads exceeding ~100K characters are rejected; use offset and limit to read specific sections of large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
"parameters": {
"type": "object",
"properties": {