mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(execute_code): add project/strict execution modes, default to project (#11971)
Weaker models (Gemma-class) repeatedly rediscover and forget that
execute_code uses a different CWD and Python interpreter than terminal(),
causing them to flip-flop on whether user files exist and to hit import
errors on project dependencies like pandas.
Adds a new 'code_execution.mode' config key (default 'project') that
brings execute_code into line with terminal()'s filesystem/interpreter:
project (new default):
- cwd = session's TERMINAL_CWD (falls back to os.getcwd())
- python = active VIRTUAL_ENV/bin/python or CONDA_PREFIX/bin/python
with a Python 3.8+ version check; falls back cleanly to
sys.executable if no venv or the candidate fails
- result : 'import pandas' works, '.env' resolves, matches terminal()
strict (opt-in):
- cwd = staging tmpdir (today's behavior)
- python = sys.executable (today's behavior)
- result : maximum reproducibility and isolation; project deps
won't resolve
Security-critical invariants are identical across both modes and covered by
explicit regression tests:
- env scrubbing (strips *_API_KEY, *_TOKEN, *_SECRET, *_PASSWORD,
*_CREDENTIAL, *_PASSWD, *_AUTH substrings)
- SANDBOX_ALLOWED_TOOLS whitelist (no execute_code recursion, no
delegate_task, no MCP from inside scripts)
- resource caps (5-min timeout, 50KB stdout, 50 tool calls)
Deliberately avoids 'sandbox'/'isolated'/'cloud' language in tool
descriptions (regression from commit 39b83f34 where agents on local
backends falsely believed they were sandboxed and refused networking).
Override via env var: HERMES_EXECUTE_CODE_MODE=strict|project
This commit is contained in:
parent
54e0eb24c0
commit
285bb2b915
5 changed files with 643 additions and 14 deletions
|
|
@ -771,6 +771,20 @@ DEFAULT_CONFIG = {
|
|||
"wrap_response": True,
|
||||
},
|
||||
|
||||
# execute_code settings — controls the tool used for programmatic tool calls.
|
||||
"code_execution": {
|
||||
# Execution mode:
|
||||
# project (default) — scripts run in the session's working directory
|
||||
# with the active virtualenv/conda env's python, so project deps
|
||||
# (pandas, torch, project packages) and relative paths resolve.
|
||||
# strict — scripts run in an isolated temp directory with
|
||||
# hermes-agent's own python (sys.executable). Maximum isolation
|
||||
# and reproducibility; project deps and relative paths won't work.
|
||||
# Env scrubbing (strips *_API_KEY, *_TOKEN, *_SECRET, ...) and the
|
||||
# tool whitelist apply identically in both modes.
|
||||
"mode": "project",
|
||||
},
|
||||
|
||||
# Logging — controls file logging to ~/.hermes/logs/.
|
||||
# agent.log captures INFO+ (all agent activity); errors.log captures WARNING+.
|
||||
"logging": {
|
||||
|
|
@ -788,7 +802,7 @@ DEFAULT_CONFIG = {
|
|||
},
|
||||
|
||||
# Config schema version - bump this when adding new required fields
|
||||
"_config_version": 18,
|
||||
"_config_version": 19,
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -274,9 +274,9 @@ def get_tool_definitions(
|
|||
# execute_code" even when the API key isn't configured or the toolset is
|
||||
# disabled (#560-discord).
|
||||
if "execute_code" in available_tool_names:
|
||||
from tools.code_execution_tool import SANDBOX_ALLOWED_TOOLS, build_execute_code_schema
|
||||
from tools.code_execution_tool import SANDBOX_ALLOWED_TOOLS, build_execute_code_schema, _get_execution_mode
|
||||
sandbox_enabled = SANDBOX_ALLOWED_TOOLS & available_tool_names
|
||||
dynamic_schema = build_execute_code_schema(sandbox_enabled)
|
||||
dynamic_schema = build_execute_code_schema(sandbox_enabled, mode=_get_execution_mode())
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == "execute_code":
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic_schema}
|
||||
|
|
|
|||
|
|
@ -459,7 +459,7 @@ class TestCustomProviderCompatibility:
|
|||
migrate_config(interactive=False, quiet=True)
|
||||
raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
|
||||
|
||||
assert raw["_config_version"] == 18
|
||||
assert raw["_config_version"] == 19
|
||||
assert raw["providers"]["openai-direct"] == {
|
||||
"api": "https://api.openai.com/v1",
|
||||
"api_key": "test-key",
|
||||
|
|
@ -606,7 +606,7 @@ class TestInterimAssistantMessageConfig:
|
|||
migrate_config(interactive=False, quiet=True)
|
||||
raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
|
||||
|
||||
assert raw["_config_version"] == 18
|
||||
assert raw["_config_version"] == 19
|
||||
assert raw["display"]["tool_progress"] == "off"
|
||||
assert raw["display"]["interim_assistant_messages"] is True
|
||||
|
||||
|
|
@ -626,6 +626,6 @@ class TestDiscordChannelPromptsConfig:
|
|||
migrate_config(interactive=False, quiet=True)
|
||||
raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
|
||||
|
||||
assert raw["_config_version"] == 18
|
||||
assert raw["_config_version"] == 19
|
||||
assert raw["discord"]["auto_thread"] is True
|
||||
assert raw["discord"]["channel_prompts"] == {}
|
||||
|
|
|
|||
455
tests/tools/test_code_execution_modes.py
Normal file
455
tests/tools/test_code_execution_modes.py
Normal file
|
|
@ -0,0 +1,455 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Tests for execute_code's strict / project execution modes.
|
||||
|
||||
The mode switch controls two things:
|
||||
- working directory: staging tmpdir (strict) vs session CWD (project)
|
||||
- interpreter: sys.executable (strict) vs active venv's python (project)
|
||||
|
||||
Security-critical invariants — env scrubbing, tool whitelist, resource caps —
|
||||
must apply identically in both modes. These tests guard all three layers.
|
||||
|
||||
Mode is sourced exclusively from ``code_execution.mode`` in config.yaml —
|
||||
there is no env-var override. Tests patch ``_load_config`` directly.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
from contextlib import contextmanager
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
os.environ["TERMINAL_ENV"] = "local"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _force_local_terminal(monkeypatch):
|
||||
"""Mirror test_code_execution.py — guarantee local backend under xdist."""
|
||||
monkeypatch.setenv("TERMINAL_ENV", "local")
|
||||
|
||||
|
||||
from tools.code_execution_tool import (
|
||||
SANDBOX_ALLOWED_TOOLS,
|
||||
DEFAULT_EXECUTION_MODE,
|
||||
EXECUTION_MODES,
|
||||
_get_execution_mode,
|
||||
_is_usable_python,
|
||||
_resolve_child_cwd,
|
||||
_resolve_child_python,
|
||||
build_execute_code_schema,
|
||||
execute_code,
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _mock_mode(mode):
|
||||
"""Context manager that pins code_execution.mode to the given value."""
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": mode}):
|
||||
yield
|
||||
|
||||
|
||||
def _mock_handle_function_call(function_name, function_args, task_id=None, user_task=None):
|
||||
"""Minimal mock dispatcher reused across tests."""
|
||||
if function_name == "terminal":
|
||||
return json.dumps({"output": "mock", "exit_code": 0})
|
||||
if function_name == "read_file":
|
||||
return json.dumps({"content": "line1\n", "total_lines": 1})
|
||||
return json.dumps({"error": f"Unknown tool: {function_name}"})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mode resolution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGetExecutionMode(unittest.TestCase):
|
||||
"""_get_execution_mode reads config.yaml only (no env var surface)."""
|
||||
|
||||
def test_default_is_project(self):
|
||||
self.assertEqual(DEFAULT_EXECUTION_MODE, "project")
|
||||
|
||||
def test_config_project(self):
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": "project"}):
|
||||
self.assertEqual(_get_execution_mode(), "project")
|
||||
|
||||
def test_config_strict(self):
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": "strict"}):
|
||||
self.assertEqual(_get_execution_mode(), "strict")
|
||||
|
||||
def test_config_case_insensitive(self):
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": "STRICT"}):
|
||||
self.assertEqual(_get_execution_mode(), "strict")
|
||||
|
||||
def test_config_strips_whitespace(self):
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": " project "}):
|
||||
self.assertEqual(_get_execution_mode(), "project")
|
||||
|
||||
def test_empty_config_falls_back_to_default(self):
|
||||
with patch("tools.code_execution_tool._load_config", return_value={}):
|
||||
self.assertEqual(_get_execution_mode(), DEFAULT_EXECUTION_MODE)
|
||||
|
||||
def test_bogus_config_falls_back_to_default(self):
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": "banana"}):
|
||||
self.assertEqual(_get_execution_mode(), DEFAULT_EXECUTION_MODE)
|
||||
|
||||
def test_none_config_falls_back_to_default(self):
|
||||
with patch("tools.code_execution_tool._load_config",
|
||||
return_value={"mode": None}):
|
||||
# str(None).lower() = "none" → not in EXECUTION_MODES → default
|
||||
self.assertEqual(_get_execution_mode(), DEFAULT_EXECUTION_MODE)
|
||||
|
||||
def test_execution_modes_tuple(self):
|
||||
"""Canonical set of modes — tests + config layer rely on this shape."""
|
||||
self.assertEqual(set(EXECUTION_MODES), {"project", "strict"})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Interpreter resolver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestResolveChildPython(unittest.TestCase):
|
||||
"""_resolve_child_python — picks the right interpreter per mode."""
|
||||
|
||||
def test_strict_always_sys_executable(self):
|
||||
"""Strict mode never leaves sys.executable, even if venv is set."""
|
||||
with patch.dict(os.environ, {"VIRTUAL_ENV": "/some/venv"}):
|
||||
self.assertEqual(_resolve_child_python("strict"), sys.executable)
|
||||
|
||||
def test_project_with_no_venv_falls_back(self):
|
||||
"""Project mode without VIRTUAL_ENV or CONDA_PREFIX → sys.executable."""
|
||||
env = {k: v for k, v in os.environ.items()
|
||||
if k not in ("VIRTUAL_ENV", "CONDA_PREFIX")}
|
||||
with patch.dict(os.environ, env, clear=True):
|
||||
self.assertEqual(_resolve_child_python("project"), sys.executable)
|
||||
|
||||
def test_project_with_virtualenv_picks_venv_python(self):
|
||||
"""Project mode + VIRTUAL_ENV pointing at a real venv → that python."""
|
||||
import tempfile, pathlib
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
fake_venv = pathlib.Path(td)
|
||||
(fake_venv / "bin").mkdir()
|
||||
# Symlink to real python so the version check actually passes
|
||||
(fake_venv / "bin" / "python").symlink_to(sys.executable)
|
||||
with patch.dict(os.environ, {"VIRTUAL_ENV": str(fake_venv)}):
|
||||
# Clear cache — _is_usable_python memoizes on path
|
||||
_is_usable_python.cache_clear()
|
||||
result = _resolve_child_python("project")
|
||||
self.assertEqual(result, str(fake_venv / "bin" / "python"))
|
||||
|
||||
def test_project_with_broken_venv_falls_back(self):
|
||||
"""VIRTUAL_ENV set but bin/python missing → sys.executable."""
|
||||
import tempfile
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
# No bin/python inside — broken venv
|
||||
with patch.dict(os.environ, {"VIRTUAL_ENV": td}):
|
||||
_is_usable_python.cache_clear()
|
||||
self.assertEqual(_resolve_child_python("project"), sys.executable)
|
||||
|
||||
def test_project_prefers_virtualenv_over_conda(self):
|
||||
"""If both VIRTUAL_ENV and CONDA_PREFIX are set, VIRTUAL_ENV wins."""
|
||||
import tempfile, pathlib
|
||||
with tempfile.TemporaryDirectory() as ve_td, tempfile.TemporaryDirectory() as conda_td:
|
||||
ve = pathlib.Path(ve_td)
|
||||
(ve / "bin").mkdir()
|
||||
(ve / "bin" / "python").symlink_to(sys.executable)
|
||||
|
||||
conda = pathlib.Path(conda_td)
|
||||
(conda / "bin").mkdir()
|
||||
(conda / "bin" / "python").symlink_to(sys.executable)
|
||||
|
||||
with patch.dict(os.environ, {"VIRTUAL_ENV": str(ve), "CONDA_PREFIX": str(conda)}):
|
||||
_is_usable_python.cache_clear()
|
||||
result = _resolve_child_python("project")
|
||||
self.assertEqual(result, str(ve / "bin" / "python"))
|
||||
|
||||
def test_is_usable_python_rejects_nonexistent(self):
|
||||
_is_usable_python.cache_clear()
|
||||
self.assertFalse(_is_usable_python("/does/not/exist/python"))
|
||||
|
||||
def test_is_usable_python_accepts_real_python(self):
|
||||
_is_usable_python.cache_clear()
|
||||
self.assertTrue(_is_usable_python(sys.executable))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CWD resolver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestResolveChildCwd(unittest.TestCase):
|
||||
|
||||
def test_strict_uses_staging_dir(self):
|
||||
self.assertEqual(_resolve_child_cwd("strict", "/tmp/staging"), "/tmp/staging")
|
||||
|
||||
def test_project_without_terminal_cwd_uses_getcwd(self):
|
||||
env = {k: v for k, v in os.environ.items() if k != "TERMINAL_CWD"}
|
||||
with patch.dict(os.environ, env, clear=True):
|
||||
self.assertEqual(_resolve_child_cwd("project", "/tmp/staging"), os.getcwd())
|
||||
|
||||
def test_project_uses_terminal_cwd_when_set(self):
|
||||
import tempfile
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
with patch.dict(os.environ, {"TERMINAL_CWD": td}):
|
||||
self.assertEqual(_resolve_child_cwd("project", "/tmp/staging"), td)
|
||||
|
||||
def test_project_bogus_terminal_cwd_falls_back_to_getcwd(self):
|
||||
with patch.dict(os.environ, {"TERMINAL_CWD": "/does/not/exist/anywhere"}):
|
||||
self.assertEqual(_resolve_child_cwd("project", "/tmp/staging"), os.getcwd())
|
||||
|
||||
def test_project_expands_tilde(self):
|
||||
import pathlib
|
||||
home = str(pathlib.Path.home())
|
||||
with patch.dict(os.environ, {"TERMINAL_CWD": "~"}):
|
||||
self.assertEqual(_resolve_child_cwd("project", "/tmp/staging"), home)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema description
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestModeAwareSchema(unittest.TestCase):
|
||||
|
||||
def test_strict_description_mentions_temp_dir(self):
|
||||
desc = build_execute_code_schema(mode="strict")["description"]
|
||||
self.assertIn("temp dir", desc)
|
||||
|
||||
def test_project_description_mentions_session_and_venv(self):
|
||||
desc = build_execute_code_schema(mode="project")["description"]
|
||||
self.assertIn("session", desc)
|
||||
self.assertIn("venv", desc)
|
||||
|
||||
def test_neither_description_uses_sandbox_language(self):
|
||||
"""REGRESSION GUARD for commit 39b83f34.
|
||||
|
||||
Agents on local backends falsely believed they were sandboxed and
|
||||
refused networking tasks. Do not reintroduce any 'sandbox' /
|
||||
'isolated' / 'cloud' language in the tool description.
|
||||
"""
|
||||
for mode in EXECUTION_MODES:
|
||||
desc = build_execute_code_schema(mode=mode)["description"].lower()
|
||||
for forbidden in ("sandbox", "isolated", "cloud"):
|
||||
self.assertNotIn(forbidden, desc,
|
||||
f"mode={mode}: '{forbidden}' leaked into description")
|
||||
|
||||
def test_descriptions_are_similar_length(self):
|
||||
"""Both modes should have roughly the same-size description."""
|
||||
strict = len(build_execute_code_schema(mode="strict")["description"])
|
||||
project = len(build_execute_code_schema(mode="project")["description"])
|
||||
self.assertLess(abs(strict - project), 200)
|
||||
|
||||
def test_default_mode_reads_config(self):
|
||||
"""build_execute_code_schema() with mode=None reads config.yaml."""
|
||||
with _mock_mode("strict"):
|
||||
desc = build_execute_code_schema()["description"]
|
||||
self.assertIn("temp dir", desc)
|
||||
with _mock_mode("project"):
|
||||
desc = build_execute_code_schema()["description"]
|
||||
self.assertIn("session", desc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Integration: what actually happens when execute_code runs per mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="execute_code is POSIX-only")
|
||||
class TestExecuteCodeModeIntegration(unittest.TestCase):
|
||||
"""End-to-end: verify the subprocess actually runs where we expect."""
|
||||
|
||||
def _run(self, code, mode, enabled_tools=None, extra_env=None):
|
||||
env_overrides = extra_env or {}
|
||||
with _mock_mode(mode):
|
||||
with patch.dict(os.environ, env_overrides):
|
||||
with patch("model_tools.handle_function_call",
|
||||
side_effect=_mock_handle_function_call):
|
||||
raw = execute_code(
|
||||
code=code,
|
||||
task_id=f"test-{mode}",
|
||||
enabled_tools=enabled_tools or list(SANDBOX_ALLOWED_TOOLS),
|
||||
)
|
||||
return json.loads(raw)
|
||||
|
||||
def test_strict_mode_runs_in_tmpdir(self):
|
||||
"""Strict mode: script's os.getcwd() is the staging tmpdir."""
|
||||
result = self._run("import os; print(os.getcwd())", mode="strict")
|
||||
self.assertEqual(result["status"], "success")
|
||||
self.assertIn("hermes_sandbox_", result["output"])
|
||||
|
||||
def test_project_mode_runs_in_session_cwd(self):
|
||||
"""Project mode: script's os.getcwd() is the session's working dir."""
|
||||
import tempfile
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
result = self._run(
|
||||
"import os; print(os.getcwd())",
|
||||
mode="project",
|
||||
extra_env={"TERMINAL_CWD": td},
|
||||
)
|
||||
self.assertEqual(result["status"], "success")
|
||||
# Resolve symlinks (macOS /tmp → /private/tmp) on both sides
|
||||
self.assertEqual(
|
||||
os.path.realpath(result["output"].strip()),
|
||||
os.path.realpath(td),
|
||||
)
|
||||
|
||||
def test_project_mode_interpreter_is_venv_python(self):
|
||||
"""Project mode: sys.executable inside the child is the venv's python
|
||||
when VIRTUAL_ENV is set to a real venv."""
|
||||
# The hermes-agent venv is always active during tests, so this also
|
||||
# happens to equal sys.executable of the parent. What we're asserting
|
||||
# is: resolver picked a venv-bin/python path, not that it differs
|
||||
# from sys.executable.
|
||||
result = self._run("import sys; print(sys.executable)", mode="project")
|
||||
self.assertEqual(result["status"], "success")
|
||||
# Either VIRTUAL_ENV-bin/python or sys.executable fallback, both OK.
|
||||
output = result["output"].strip()
|
||||
ve = os.environ.get("VIRTUAL_ENV", "").strip()
|
||||
if ve:
|
||||
self.assertTrue(
|
||||
output.startswith(ve) or output == sys.executable,
|
||||
f"project-mode python should be under VIRTUAL_ENV={ve} or sys.executable={sys.executable}, got {output}",
|
||||
)
|
||||
|
||||
def test_project_mode_can_still_import_hermes_tools(self):
|
||||
"""Regression: hermes_tools still importable from non-tmpdir CWD.
|
||||
|
||||
This is the PYTHONPATH fix — without it, switching to session CWD
|
||||
breaks `from hermes_tools import terminal`.
|
||||
"""
|
||||
import tempfile
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
code = (
|
||||
"from hermes_tools import terminal\n"
|
||||
"r = terminal('echo x')\n"
|
||||
"print(r.get('output', 'MISSING'))\n"
|
||||
)
|
||||
result = self._run(code, mode="project", extra_env={"TERMINAL_CWD": td})
|
||||
self.assertEqual(result["status"], "success")
|
||||
self.assertIn("mock", result["output"])
|
||||
|
||||
def test_strict_mode_can_still_import_hermes_tools(self):
|
||||
"""Regression: strict mode's tmpdir CWD still works for imports."""
|
||||
code = (
|
||||
"from hermes_tools import terminal\n"
|
||||
"r = terminal('echo x')\n"
|
||||
"print(r.get('output', 'MISSING'))\n"
|
||||
)
|
||||
result = self._run(code, mode="strict")
|
||||
self.assertEqual(result["status"], "success")
|
||||
self.assertIn("mock", result["output"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SECURITY-CRITICAL regression guards
|
||||
#
|
||||
# These MUST pass in both strict and project mode. The whole tiered-mode
|
||||
# proposition rests on the claim that switching from strict to project only
|
||||
# changes CWD + interpreter, not the security posture.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="execute_code is POSIX-only")
|
||||
class TestSecurityInvariantsAcrossModes(unittest.TestCase):
|
||||
|
||||
def _run(self, code, mode):
|
||||
with _mock_mode(mode):
|
||||
with patch("model_tools.handle_function_call",
|
||||
side_effect=_mock_handle_function_call):
|
||||
raw = execute_code(
|
||||
code=code,
|
||||
task_id=f"test-sec-{mode}",
|
||||
enabled_tools=list(SANDBOX_ALLOWED_TOOLS),
|
||||
)
|
||||
return json.loads(raw)
|
||||
|
||||
def test_api_keys_scrubbed_in_strict_mode(self):
|
||||
code = (
|
||||
"import os\n"
|
||||
"print('KEY=' + os.environ.get('OPENAI_API_KEY', 'MISSING'))\n"
|
||||
"print('TOK=' + os.environ.get('ANTHROPIC_API_KEY', 'MISSING'))\n"
|
||||
)
|
||||
with patch.dict(os.environ, {
|
||||
"OPENAI_API_KEY": "sk-should-not-leak",
|
||||
"ANTHROPIC_API_KEY": "ant-should-not-leak",
|
||||
}):
|
||||
result = self._run(code, mode="strict")
|
||||
self.assertEqual(result["status"], "success")
|
||||
self.assertIn("KEY=MISSING", result["output"])
|
||||
self.assertIn("TOK=MISSING", result["output"])
|
||||
self.assertNotIn("sk-should-not-leak", result["output"])
|
||||
self.assertNotIn("ant-should-not-leak", result["output"])
|
||||
|
||||
def test_api_keys_scrubbed_in_project_mode(self):
|
||||
"""CRITICAL: the project-mode default does NOT leak user credentials."""
|
||||
code = (
|
||||
"import os\n"
|
||||
"print('KEY=' + os.environ.get('OPENAI_API_KEY', 'MISSING'))\n"
|
||||
"print('TOK=' + os.environ.get('ANTHROPIC_API_KEY', 'MISSING'))\n"
|
||||
"print('SEC=' + os.environ.get('GITHUB_TOKEN', 'MISSING'))\n"
|
||||
)
|
||||
with patch.dict(os.environ, {
|
||||
"OPENAI_API_KEY": "sk-should-not-leak",
|
||||
"ANTHROPIC_API_KEY": "ant-should-not-leak",
|
||||
"GITHUB_TOKEN": "ghp-should-not-leak",
|
||||
}):
|
||||
result = self._run(code, mode="project")
|
||||
self.assertEqual(result["status"], "success")
|
||||
for needle in ("KEY=MISSING", "TOK=MISSING", "SEC=MISSING"):
|
||||
self.assertIn(needle, result["output"])
|
||||
for leaked in ("sk-should-not-leak", "ant-should-not-leak", "ghp-should-not-leak"):
|
||||
self.assertNotIn(leaked, result["output"])
|
||||
|
||||
def test_secret_substrings_scrubbed_in_project_mode(self):
|
||||
"""SECRET/PASSWORD/CREDENTIAL/PASSWD/AUTH filters still apply."""
|
||||
code = (
|
||||
"import os\n"
|
||||
"for k in ('MY_SECRET', 'DB_PASSWORD', 'VAULT_CREDENTIAL', "
|
||||
"'LDAP_PASSWD', 'AUTH_TOKEN'):\n"
|
||||
" print(f'{k}=' + os.environ.get(k, 'MISSING'))\n"
|
||||
)
|
||||
with patch.dict(os.environ, {
|
||||
"MY_SECRET": "secret-should-not-leak",
|
||||
"DB_PASSWORD": "password-should-not-leak",
|
||||
"VAULT_CREDENTIAL": "cred-should-not-leak",
|
||||
"LDAP_PASSWD": "passwd-should-not-leak",
|
||||
"AUTH_TOKEN": "auth-should-not-leak",
|
||||
}):
|
||||
result = self._run(code, mode="project")
|
||||
self.assertEqual(result["status"], "success")
|
||||
for leaked in ("secret-should-not-leak", "password-should-not-leak",
|
||||
"cred-should-not-leak", "passwd-should-not-leak",
|
||||
"auth-should-not-leak"):
|
||||
self.assertNotIn(leaked, result["output"])
|
||||
|
||||
def test_tool_whitelist_enforced_in_strict_mode(self):
|
||||
"""A script cannot RPC-call tools outside SANDBOX_ALLOWED_TOOLS."""
|
||||
# execute_code is NOT in SANDBOX_ALLOWED_TOOLS (no recursion)
|
||||
self.assertNotIn("execute_code", SANDBOX_ALLOWED_TOOLS)
|
||||
code = (
|
||||
"import hermes_tools as ht\n"
|
||||
"print('execute_code_available:', hasattr(ht, 'execute_code'))\n"
|
||||
"print('delegate_task_available:', hasattr(ht, 'delegate_task'))\n"
|
||||
)
|
||||
result = self._run(code, mode="strict")
|
||||
self.assertEqual(result["status"], "success")
|
||||
self.assertIn("execute_code_available: False", result["output"])
|
||||
self.assertIn("delegate_task_available: False", result["output"])
|
||||
|
||||
def test_tool_whitelist_enforced_in_project_mode(self):
|
||||
"""CRITICAL: project mode does NOT widen the tool whitelist."""
|
||||
code = (
|
||||
"import hermes_tools as ht\n"
|
||||
"print('execute_code_available:', hasattr(ht, 'execute_code'))\n"
|
||||
"print('delegate_task_available:', hasattr(ht, 'delegate_task'))\n"
|
||||
)
|
||||
result = self._run(code, mode="project")
|
||||
self.assertEqual(result["status"], "success")
|
||||
self.assertIn("execute_code_available: False", result["output"])
|
||||
self.assertIn("delegate_task_available: False", result["output"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -29,6 +29,7 @@ Remote execution additionally requires Python 3 in the terminal backend.
|
|||
"""
|
||||
|
||||
import base64
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
|
@ -1022,10 +1023,15 @@ def execute_code(
|
|||
child_env["HERMES_RPC_SOCKET"] = sock_path
|
||||
child_env["PYTHONDONTWRITEBYTECODE"] = "1"
|
||||
# Ensure the hermes-agent root is importable in the sandbox so
|
||||
# repo-root modules are available to child scripts.
|
||||
# repo-root modules are available to child scripts. We also prepend
|
||||
# the staging tmpdir so ``from hermes_tools import ...`` resolves even
|
||||
# when the subprocess CWD is not tmpdir (project mode).
|
||||
_hermes_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
_existing_pp = child_env.get("PYTHONPATH", "")
|
||||
child_env["PYTHONPATH"] = _hermes_root + (os.pathsep + _existing_pp if _existing_pp else "")
|
||||
_pp_parts = [tmpdir, _hermes_root]
|
||||
if _existing_pp:
|
||||
_pp_parts.append(_existing_pp)
|
||||
child_env["PYTHONPATH"] = os.pathsep.join(_pp_parts)
|
||||
# Inject user's configured timezone so datetime.now() in sandboxed
|
||||
# code reflects the correct wall-clock time. Only TZ is set —
|
||||
# HERMES_TIMEZONE is an internal Hermes setting and must not leak
|
||||
|
|
@ -1042,9 +1048,19 @@ def execute_code(
|
|||
if _profile_home:
|
||||
child_env["HOME"] = _profile_home
|
||||
|
||||
# Resolve interpreter + CWD based on execute_code mode.
|
||||
# - strict : today's behavior (sys.executable + tmpdir CWD).
|
||||
# - project: user's venv python + session's working directory, so
|
||||
# project deps like pandas and user files resolve.
|
||||
# Env scrubbing and tool whitelist apply identically in both modes.
|
||||
_mode = _get_execution_mode()
|
||||
_child_python = _resolve_child_python(_mode)
|
||||
_child_cwd = _resolve_child_cwd(_mode, tmpdir)
|
||||
_script_path = os.path.join(tmpdir, "script.py")
|
||||
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, "script.py"],
|
||||
cwd=tmpdir,
|
||||
[_child_python, _script_path],
|
||||
cwd=_child_cwd,
|
||||
env=child_env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
|
|
@ -1299,6 +1315,127 @@ def _load_config() -> dict:
|
|||
return {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Execution mode resolution (strict vs project)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Valid values for code_execution.mode. Kept as a module constant so tests
|
||||
# and the config layer can reference the canonical set.
|
||||
EXECUTION_MODES = ("project", "strict")
|
||||
DEFAULT_EXECUTION_MODE = "project"
|
||||
|
||||
|
||||
def _get_execution_mode() -> str:
|
||||
"""Return the active execute_code mode — 'project' or 'strict'.
|
||||
|
||||
Reads ``code_execution.mode`` from config.yaml; invalid values fall back
|
||||
to ``DEFAULT_EXECUTION_MODE`` ('project') with a log warning.
|
||||
|
||||
Mode semantics:
|
||||
- ``project`` (default): scripts run in the session's working directory
|
||||
with the active virtual environment's python, so project dependencies
|
||||
(pandas, torch, project packages) and files resolve naturally.
|
||||
- ``strict``: scripts run in an isolated temp directory with
|
||||
``sys.executable`` (hermes-agent's python). Reproducible and the
|
||||
interpreter is guaranteed to work, but project deps and relative paths
|
||||
won't resolve.
|
||||
|
||||
Env scrubbing and tool whitelist apply identically in both modes.
|
||||
"""
|
||||
cfg_value = str(_load_config().get("mode", DEFAULT_EXECUTION_MODE)).strip().lower()
|
||||
if cfg_value in EXECUTION_MODES:
|
||||
return cfg_value
|
||||
logger.warning(
|
||||
"Ignoring code_execution.mode=%r (expected one of %s), falling back to %r",
|
||||
cfg_value, EXECUTION_MODES, DEFAULT_EXECUTION_MODE,
|
||||
)
|
||||
return DEFAULT_EXECUTION_MODE
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=32)
|
||||
def _is_usable_python(python_path: str) -> bool:
|
||||
"""Check whether a candidate Python interpreter is usable for execute_code.
|
||||
|
||||
Requires Python 3.8+ (f-strings and stdlib modules the RPC stubs need).
|
||||
Cached so we don't fork a subprocess on every execute_code call.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[python_path, "-c",
|
||||
"import sys; sys.exit(0 if sys.version_info >= (3, 8) else 1)"],
|
||||
timeout=5,
|
||||
capture_output=True,
|
||||
)
|
||||
return result.returncode == 0
|
||||
except (OSError, subprocess.TimeoutExpired, subprocess.SubprocessError):
|
||||
return False
|
||||
|
||||
|
||||
def _resolve_child_python(mode: str) -> str:
|
||||
"""Pick the Python interpreter for the execute_code subprocess.
|
||||
|
||||
In ``strict`` mode, always ``sys.executable`` — guaranteed to work and
|
||||
keeps behavior fully reproducible across sessions.
|
||||
|
||||
In ``project`` mode, prefer the user's active virtualenv/conda env's
|
||||
python so ``import pandas`` etc. work. Falls back to ``sys.executable``
|
||||
if no venv is detected, the candidate binary is missing/not executable,
|
||||
or it fails a Python 3.8+ version check.
|
||||
"""
|
||||
if mode != "project":
|
||||
return sys.executable
|
||||
|
||||
if _IS_WINDOWS:
|
||||
exe_names = ("python.exe", "python3.exe")
|
||||
subdirs = ("Scripts",)
|
||||
else:
|
||||
exe_names = ("python", "python3")
|
||||
subdirs = ("bin",)
|
||||
|
||||
for var in ("VIRTUAL_ENV", "CONDA_PREFIX"):
|
||||
root = os.environ.get(var, "").strip()
|
||||
if not root:
|
||||
continue
|
||||
for subdir in subdirs:
|
||||
for exe in exe_names:
|
||||
candidate = os.path.join(root, subdir, exe)
|
||||
if not (os.path.isfile(candidate) and os.access(candidate, os.X_OK)):
|
||||
continue
|
||||
if _is_usable_python(candidate):
|
||||
return candidate
|
||||
# Found the interpreter but it failed the version check —
|
||||
# log once and fall through to sys.executable.
|
||||
logger.info(
|
||||
"execute_code: skipping %s=%s (Python version < 3.8 or broken). "
|
||||
"Using sys.executable instead.", var, candidate,
|
||||
)
|
||||
return sys.executable
|
||||
|
||||
return sys.executable
|
||||
|
||||
|
||||
def _resolve_child_cwd(mode: str, staging_dir: str) -> str:
|
||||
"""Resolve the working directory for the execute_code subprocess.
|
||||
|
||||
- ``strict``: the staging tmpdir (today's behavior).
|
||||
- ``project``: the session's TERMINAL_CWD (same as the terminal tool), or
|
||||
``os.getcwd()`` if TERMINAL_CWD is unset or doesn't point at a real dir.
|
||||
Falls back to the staging tmpdir as a last resort so we never invoke
|
||||
Popen with a nonexistent cwd.
|
||||
"""
|
||||
if mode != "project":
|
||||
return staging_dir
|
||||
raw = os.environ.get("TERMINAL_CWD", "").strip()
|
||||
if raw:
|
||||
expanded = os.path.expanduser(raw)
|
||||
if os.path.isdir(expanded):
|
||||
return expanded
|
||||
here = os.getcwd()
|
||||
if os.path.isdir(here):
|
||||
return here
|
||||
return staging_dir
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenAI Function-Calling Schema
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -1330,15 +1467,24 @@ _TOOL_DOC_LINES = [
|
|||
]
|
||||
|
||||
|
||||
def build_execute_code_schema(enabled_sandbox_tools: set = None) -> dict:
|
||||
def build_execute_code_schema(enabled_sandbox_tools: set = None,
|
||||
mode: str = None) -> dict:
|
||||
"""Build the execute_code schema with description listing only enabled tools.
|
||||
|
||||
When tools are disabled via ``hermes tools`` (e.g. web is turned off),
|
||||
the schema description should NOT mention web_search / web_extract —
|
||||
otherwise the model thinks they are available and keeps trying to use them.
|
||||
|
||||
``mode`` controls the working-directory sentence in the description:
|
||||
- ``'strict'``: scripts run in a temp dir (not the session's CWD)
|
||||
- ``'project'`` (default): scripts run in the session's CWD with the
|
||||
active venv's python
|
||||
If ``mode`` is None, the current ``code_execution.mode`` config is read.
|
||||
"""
|
||||
if enabled_sandbox_tools is None:
|
||||
enabled_sandbox_tools = SANDBOX_ALLOWED_TOOLS
|
||||
if mode is None:
|
||||
mode = _get_execution_mode()
|
||||
|
||||
# Build tool documentation lines for only the enabled tools
|
||||
tool_lines = "\n".join(
|
||||
|
|
@ -1354,6 +1500,20 @@ def build_execute_code_schema(enabled_sandbox_tools: set = None) -> dict:
|
|||
else:
|
||||
import_str = "..."
|
||||
|
||||
# Mode-specific CWD guidance. Project mode is the default and matches
|
||||
# terminal()'s filesystem/interpreter; strict mode retains the isolated
|
||||
# temp-dir staging and hermes-agent's own python.
|
||||
if mode == "strict":
|
||||
cwd_note = (
|
||||
"Scripts run in their own temp dir, not the session's CWD — use absolute paths "
|
||||
"(os.path.expanduser('~/.hermes/.env')) or terminal()/read_file() for user files."
|
||||
)
|
||||
else:
|
||||
cwd_note = (
|
||||
"Scripts run in the session's working directory with the active venv's python, "
|
||||
"so project deps (pandas, etc.) and relative paths work like in terminal()."
|
||||
)
|
||||
|
||||
description = (
|
||||
"Run a Python script that can call Hermes tools programmatically. "
|
||||
"Use this when you need 3+ tool calls with processing logic between them, "
|
||||
|
|
@ -1367,8 +1527,7 @@ def build_execute_code_schema(enabled_sandbox_tools: set = None) -> dict:
|
|||
f"{tool_lines}\n\n"
|
||||
"Limits: 5-minute timeout, 50KB stdout cap, max 50 tool calls per script. "
|
||||
"terminal() is foreground-only (no background or pty).\n\n"
|
||||
"Scripts run in their own temp dir, not the session's CWD — use absolute paths "
|
||||
"(os.path.expanduser('~/.hermes/.env')) or terminal()/read_file() for user files.\n\n"
|
||||
f"{cwd_note}\n\n"
|
||||
"Print your final result to stdout. Use Python stdlib (json, re, math, csv, "
|
||||
"datetime, collections, etc.) for processing between tool calls.\n\n"
|
||||
"Also available (no import needed — built into hermes_tools):\n"
|
||||
|
|
@ -1397,7 +1556,8 @@ def build_execute_code_schema(enabled_sandbox_tools: set = None) -> dict:
|
|||
}
|
||||
|
||||
|
||||
# Default schema used at registration time (all sandbox tools listed)
|
||||
# Default schema used at registration time (all sandbox tools listed,
|
||||
# current configured mode). model_tools.py rebuilds per-session anyway.
|
||||
EXECUTE_CODE_SCHEMA = build_execute_code_schema()
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue