fix(local): recover when persistent_shell cwd is deleted (#17558)

When a tool call deletes its own working directory (`cd /tmp/foo && rm -rf /tmp/foo`), the next `subprocess.Popen(args, cwd=self.cwd)` raised `FileNotFoundError: [Errno 2]` before bash even started — every subsequent terminal/file-tool call hit the same wedge until the gateway restarted. Fix in `LocalEnvironment._run_bash`: before handing `self.cwd` to Popen, resolve a safe alternative when the path is gone (walk up to the nearest existing ancestor, falling back to `tempfile.gettempdir()` only as a last resort). Log a warning so the recovery is visible — not silent — and update `self.cwd` so the next call doesn't repeat the message. Defense in depth in `LocalEnvironment._update_cwd`: only adopt the new cwd when it still exists as a directory. `pwd -P` from a deleted cwd can leave a stale value in the marker file; refusing to store a missing path keeps `self.cwd` valid by construction. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 02:51:50 +00:00 · 2026-04-29 10:16:22 -07:00 · 2026-04-29 10:16:22 -07:00 · 9644b8ae67
commit 9644b8ae67
parent b8fb9270c4
2 changed files with 199 additions and 2 deletions
--- a/tests/tools/test_local_env_cwd_recovery.py
+++ b/tests/tools/test_local_env_cwd_recovery.py
@ -0,0 +1,151 @@
 """Tests for LocalEnvironment recovery when ``self.cwd`` is deleted.
 When a tool call inside the persistent terminal session ``rm -rf``'s its own
 working directory, the next ``subprocess.Popen(..., cwd=self.cwd)`` would
 otherwise raise ``FileNotFoundError`` before bash starts, wedging every
 subsequent terminal/file-tool call until the gateway restarts.
 Regression coverage for https://github.com/NousResearch/hermes-agent/issues/17558.
 """
 import os
 import shutil
 import tempfile
 import threading
 from unittest.mock import MagicMock, patch
 from tools.environments.local import (
    LocalEnvironment,
    _resolve_safe_cwd,
 )
 class TestResolveSafeCwd:
    """Pure-function unit tests for the recovery helper."""
    def test_returns_cwd_when_directory_exists(self, tmp_path):
        path = str(tmp_path)
        assert _resolve_safe_cwd(path) == path
    def test_walks_up_to_first_existing_ancestor(self, tmp_path):
        nested = tmp_path / "child" / "grandchild"
        nested.mkdir(parents=True)
        deleted = str(nested)
        shutil.rmtree(tmp_path / "child")
        # The deepest existing ancestor on the path is tmp_path itself.
        assert _resolve_safe_cwd(deleted) == str(tmp_path)
    def test_falls_back_when_path_is_empty(self):
        assert _resolve_safe_cwd("") == tempfile.gettempdir()
    def test_returns_tempdir_when_nothing_on_path_exists(self, monkeypatch):
        monkeypatch.setattr(os.path, "isdir", lambda p: False)
        assert _resolve_safe_cwd("/no/such/dir") == tempfile.gettempdir()
 def _fake_interrupt():
    return threading.Event()
 def _make_fake_popen(captured: dict):
    def fake_popen(cmd, **kwargs):
        captured["cwd"] = kwargs.get("cwd")
        captured["env"] = kwargs.get("env", {})
        proc = MagicMock()
        proc.poll.return_value = 0
        proc.returncode = 0
        proc.stdout = MagicMock(
            __iter__=lambda s: iter([]),
            __next__=lambda s: (_ for _ in ()).throw(StopIteration),
        )
        proc.stdin = MagicMock()
        return proc
    return fake_popen
 class TestRunBashCwdRecovery:
    """End-to-end recovery: deleted ``self.cwd`` must not crash Popen."""
    def test_recovers_when_cwd_deleted_after_init(self, tmp_path, caplog):
        """Reproduces the wedge from #17558: cwd was valid when the
        snapshot was taken, but a subsequent command deleted it before the
        next ``Popen``."""
        wedged = tmp_path / "wedge-repro"
        wedged.mkdir()
        with patch.object(LocalEnvironment, "init_session", autospec=True, return_value=None):
            env = LocalEnvironment(cwd=str(wedged), timeout=10)
        # The previous tool call deleted the working directory.
        shutil.rmtree(wedged)
        assert env.cwd == str(wedged) and not os.path.isdir(env.cwd)
        captured = {}
        with patch("tools.environments.local._find_bash", return_value="/bin/bash"), \
             patch("subprocess.Popen", side_effect=_make_fake_popen(captured)), \
             patch("tools.terminal_tool._interrupt_event", _fake_interrupt()), \
             caplog.at_level("WARNING", logger="tools.environments.local"):
            env.execute("echo hello")
        # Popen must have been handed a real, existing directory.
        assert captured["cwd"] == str(tmp_path)
        assert os.path.isdir(captured["cwd"])
        # ``self.cwd`` is updated so the next call doesn't re-warn.
        assert env.cwd == str(tmp_path)
        # The warning surfaces the wedge so it isn't silently masked.
        assert any("missing on disk" in rec.message for rec in caplog.records)
    def test_no_warning_when_cwd_still_exists(self, tmp_path, caplog):
        with patch.object(LocalEnvironment, "init_session", autospec=True, return_value=None):
            env = LocalEnvironment(cwd=str(tmp_path), timeout=10)
        captured = {}
        with patch("tools.environments.local._find_bash", return_value="/bin/bash"), \
             patch("subprocess.Popen", side_effect=_make_fake_popen(captured)), \
             patch("tools.terminal_tool._interrupt_event", _fake_interrupt()), \
             caplog.at_level("WARNING", logger="tools.environments.local"):
            env.execute("echo hello")
        assert captured["cwd"] == str(tmp_path)
        assert env.cwd == str(tmp_path)
        assert not any("missing on disk" in rec.message for rec in caplog.records)
 class TestUpdateCwdRejectsMissingPaths:
    """``_update_cwd`` must not propagate a deleted path back into ``self.cwd``."""
    def test_skips_assignment_when_marker_path_missing(self, tmp_path):
        original = tmp_path / "starting"
        original.mkdir()
        with patch.object(LocalEnvironment, "init_session", autospec=True, return_value=None):
            env = LocalEnvironment(cwd=str(original), timeout=10)
        # Simulate the stale-marker case: the prior command's ``pwd -P`` left
        # a path in the cwd file, but that path has since been deleted.
        deleted = tmp_path / "wedge-repro"
        with open(env._cwd_file, "w") as f:
            f.write(str(deleted))
        env._update_cwd({"output": "", "returncode": 0})
        assert env.cwd == str(original)
    def test_accepts_assignment_when_marker_path_exists(self, tmp_path):
        original = tmp_path / "starting"
        original.mkdir()
        new_dir = tmp_path / "next"
        new_dir.mkdir()
        with patch.object(LocalEnvironment, "init_session", autospec=True, return_value=None):
            env = LocalEnvironment(cwd=str(original), timeout=10)
        with open(env._cwd_file, "w") as f:
            f.write(str(new_dir))
        env._update_cwd({"output": "", "returncode": 0})
        assert env.cwd == str(new_dir)
--- a/tools/environments/local.py
+++ b/tools/environments/local.py
@ -1,5 +1,6 @@
 """Local execution environment — spawn-per-call with session snapshot."""
 import logging
 import os
 import platform
 import shutil
@ -12,6 +13,30 @@ from tools.environments.base import BaseEnvironment, _pipe_stdin
 _IS_WINDOWS = platform.system() == "Windows"
 logger = logging.getLogger(__name__)
 def _resolve_safe_cwd(cwd: str) -> str:
    """Return ``cwd`` if it exists as a directory, else the nearest existing
    ancestor.  Falls back to ``tempfile.gettempdir()`` only if walking up the
    path can't find any existing directory (effectively never on a healthy
    filesystem, but cheap belt-and-braces).
    Used by ``_run_bash`` to recover when the configured cwd is gone — most
    commonly because a previous tool call deleted its own working directory
    (issue #17558).  Without this guard, ``subprocess.Popen(..., cwd=...)``
    raises ``FileNotFoundError`` before bash starts, wedging every subsequent
    terminal call until the gateway restarts.
    """
    if cwd and os.path.isdir(cwd):
        return cwd
    parent = os.path.dirname(cwd) if cwd else ""
    while parent and parent != os.path.dirname(parent):
        if os.path.isdir(parent):
            return parent
        parent = os.path.dirname(parent)
    return tempfile.gettempdir()
 # Hermes-internal env vars that should NOT leak into terminal subprocesses.
 _HERMES_PROVIDER_ENV_FORCE_PREFIX = "_HERMES_FORCE_"
@ -358,6 +383,21 @@ class LocalEnvironment(BaseEnvironment):
        args = [bash, "-l", "-c", cmd_string] if login else [bash, "-c", cmd_string]
        run_env = _make_run_env(self.env)
        # Recover when the cwd has been deleted out from under us — usually by
        # a previous tool call that ran ``rm -rf`` on its own working dir
        # (issue #17558).  Popen would otherwise raise FileNotFoundError on
        # the cwd before bash starts, wedging every subsequent call until the
        # gateway restarts.
        safe_cwd = _resolve_safe_cwd(self.cwd)
        if safe_cwd != self.cwd:
            logger.warning(
                "LocalEnvironment cwd %r is missing on disk; "
                "falling back to %r so terminal commands keep working.",
                self.cwd,
                safe_cwd,
            )
            self.cwd = safe_cwd
        proc = subprocess.Popen(
            args,
            text=True,
@ -452,11 +492,17 @@ class LocalEnvironment(BaseEnvironment):
                pass
    def _update_cwd(self, result: dict):
-        """Read CWD from temp file (local-only, no round-trip needed)."""
+        """Read CWD from temp file (local-only, no round-trip needed).
        Skip the assignment when the path no longer exists as a directory —
        ``pwd -P`` on a deleted cwd can leave a stale value in the marker
        file, and propagating it would re-wedge the next ``Popen``.  The
        ``_run_bash`` recovery path will resolve a safe fallback if needed.
        """
        try:
            with open(self._cwd_file) as f:
                cwd_path = f.read().strip()
-            if cwd_path:
+            if cwd_path and os.path.isdir(cwd_path):
                self.cwd = cwd_path
        except (OSError, FileNotFoundError):
            pass