diff --git a/tests/tools/test_terminal_exit_semantics.py b/tests/tools/test_terminal_exit_semantics.py new file mode 100644 index 0000000000..f375f6f2e1 --- /dev/null +++ b/tests/tools/test_terminal_exit_semantics.py @@ -0,0 +1,152 @@ +"""Tests for terminal command exit code semantic interpretation.""" + +import pytest + +from tools.terminal_tool import _interpret_exit_code + + +class TestInterpretExitCode: + """Test _interpret_exit_code returns correct notes for known command semantics.""" + + # ---- exit code 0 always returns None ---- + + def test_success_returns_none(self): + assert _interpret_exit_code("grep foo bar", 0) is None + assert _interpret_exit_code("diff a b", 0) is None + assert _interpret_exit_code("test -f /etc/passwd", 0) is None + + # ---- grep / rg family: exit 1 = no matches ---- + + @pytest.mark.parametrize("cmd", [ + "grep 'pattern' file.txt", + "egrep 'pattern' file.txt", + "fgrep 'pattern' file.txt", + "rg 'foo' .", + "ag 'foo' .", + "ack 'foo' .", + ]) + def test_grep_family_no_matches(self, cmd): + result = _interpret_exit_code(cmd, 1) + assert result is not None + assert "no matches" in result.lower() + + def test_grep_real_error_no_note(self): + """grep exit 2+ is a real error — should return None.""" + assert _interpret_exit_code("grep 'foo' bar", 2) is None + assert _interpret_exit_code("rg 'foo' .", 2) is None + + # ---- diff: exit 1 = files differ ---- + + def test_diff_files_differ(self): + result = _interpret_exit_code("diff file1 file2", 1) + assert result is not None + assert "differ" in result.lower() + + def test_colordiff_files_differ(self): + result = _interpret_exit_code("colordiff file1 file2", 1) + assert result is not None + assert "differ" in result.lower() + + def test_diff_real_error_no_note(self): + assert _interpret_exit_code("diff a b", 2) is None + + # ---- test / [: exit 1 = condition false ---- + + def test_test_condition_false(self): + result = _interpret_exit_code("test -f /nonexistent", 1) + assert result is not None + assert "false" in result.lower() + + def test_bracket_condition_false(self): + result = _interpret_exit_code("[ -f /nonexistent ]", 1) + assert result is not None + assert "false" in result.lower() + + # ---- find: exit 1 = partial success ---- + + def test_find_partial_success(self): + result = _interpret_exit_code("find . -name '*.py'", 1) + assert result is not None + assert "inaccessible" in result.lower() + + # ---- curl: various informational codes ---- + + def test_curl_timeout(self): + result = _interpret_exit_code("curl https://example.com", 28) + assert result is not None + assert "timed out" in result.lower() + + def test_curl_connection_refused(self): + result = _interpret_exit_code("curl http://localhost:99999", 7) + assert result is not None + assert "connect" in result.lower() + + # ---- git: exit 1 is context-dependent ---- + + def test_git_diff_exit_1(self): + result = _interpret_exit_code("git diff HEAD~1", 1) + assert result is not None + assert "normal" in result.lower() + + # ---- pipeline / chain handling ---- + + def test_pipeline_last_command(self): + """In a pipeline, the last command determines the exit code.""" + result = _interpret_exit_code("ls -la | grep 'pattern'", 1) + assert result is not None + assert "no matches" in result.lower() + + def test_and_chain_last_command(self): + result = _interpret_exit_code("cd /tmp && grep foo bar", 1) + assert result is not None + assert "no matches" in result.lower() + + def test_semicolon_chain_last_command(self): + result = _interpret_exit_code("cat file; diff a b", 1) + assert result is not None + assert "differ" in result.lower() + + def test_or_chain_last_command(self): + result = _interpret_exit_code("false || grep foo bar", 1) + assert result is not None + assert "no matches" in result.lower() + + # ---- full paths ---- + + def test_full_path_command(self): + result = _interpret_exit_code("/usr/bin/grep 'foo' bar", 1) + assert result is not None + assert "no matches" in result.lower() + + # ---- env var prefix ---- + + def test_env_var_prefix_stripped(self): + result = _interpret_exit_code("LANG=C grep 'foo' bar", 1) + assert result is not None + assert "no matches" in result.lower() + + def test_multiple_env_vars(self): + result = _interpret_exit_code("FOO=1 BAR=2 grep 'foo' bar", 1) + assert result is not None + assert "no matches" in result.lower() + + # ---- unknown commands return None ---- + + @pytest.mark.parametrize("cmd", [ + "python3 script.py", + "rm -rf /tmp/test", + "npm test", + "make build", + "cargo build", + ]) + def test_unknown_commands_return_none(self, cmd): + assert _interpret_exit_code(cmd, 1) is None + + # ---- edge cases ---- + + def test_empty_command(self): + assert _interpret_exit_code("", 1) is None + + def test_only_env_vars(self): + """Command with only env var assignments, no actual command.""" + assert _interpret_exit_code("FOO=bar", 1) is None diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index 31d7d77c67..26591ceedb 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -35,6 +35,7 @@ import json import logging import os import platform +import re import time import threading import atexit @@ -899,6 +900,78 @@ def _atexit_cleanup(): atexit.register(_atexit_cleanup) +# ============================================================================= +# Exit Code Context for Common CLI Tools +# ============================================================================= +# Many Unix commands use non-zero exit codes for informational purposes, not +# to indicate failure. The model sees a raw exit_code=1 from `grep` and +# wastes a turn investigating something that just means "no matches". +# This lookup adds a human-readable note so the agent can move on. + +def _interpret_exit_code(command: str, exit_code: int) -> str | None: + """Return a human-readable note when a non-zero exit code is non-erroneous. + + Returns None when the exit code is 0 or genuinely signals an error. + The note is appended to the tool result so the model doesn't waste + turns investigating expected exit codes. + """ + if exit_code == 0: + return None + + # Extract the last command in a pipeline/chain — that determines the + # exit code. Handles `cmd1 && cmd2`, `cmd1 | cmd2`, `cmd1; cmd2`. + # Deliberately simple: split on shell operators and take the last piece. + segments = re.split(r'\s*(?:\|\||&&|[|;])\s*', command) + last_segment = (segments[-1] if segments else command).strip() + + # Get base command name (first word), stripping env var assignments + # like VAR=val cmd ... + words = last_segment.split() + base_cmd = "" + for w in words: + if "=" in w and not w.startswith("-"): + continue # skip VAR=val + base_cmd = w.split("/")[-1] # handle /usr/bin/grep -> grep + break + + if not base_cmd: + return None + + # Command-specific semantics + semantics: dict[str, dict[int, str]] = { + # grep/rg/ag/ack: 1=no matches found (normal), 2+=real error + "grep": {1: "No matches found (not an error)"}, + "egrep": {1: "No matches found (not an error)"}, + "fgrep": {1: "No matches found (not an error)"}, + "rg": {1: "No matches found (not an error)"}, + "ag": {1: "No matches found (not an error)"}, + "ack": {1: "No matches found (not an error)"}, + # diff: 1=files differ (expected), 2+=real error + "diff": {1: "Files differ (expected, not an error)"}, + "colordiff": {1: "Files differ (expected, not an error)"}, + # find: 1=some dirs inaccessible but results may still be valid + "find": {1: "Some directories were inaccessible (partial results may still be valid)"}, + # test/[: 1=condition is false (expected) + "test": {1: "Condition evaluated to false (expected, not an error)"}, + "[": {1: "Condition evaluated to false (expected, not an error)"}, + # curl: common non-error codes + "curl": { + 6: "Could not resolve host", + 7: "Failed to connect to host", + 22: "HTTP response code indicated error (e.g. 404, 500)", + 28: "Operation timed out", + }, + # git: 1 is context-dependent but often normal (e.g. git diff with changes) + "git": {1: "Non-zero exit (often normal — e.g. 'git diff' returns 1 when files differ)"}, + } + + cmd_semantics = semantics.get(base_cmd) + if cmd_semantics and exit_code in cmd_semantics: + return cmd_semantics[exit_code] + + return None + + def terminal_tool( command: str, background: bool = False, @@ -1242,13 +1315,20 @@ def terminal_tool( from agent.redact import redact_sensitive_text output = redact_sensitive_text(output.strip()) if output else "" + # Interpret non-zero exit codes that aren't real errors + # (e.g. grep=1 means "no matches", diff=1 means "files differ") + exit_note = _interpret_exit_code(command, returncode) + result_dict = { "output": output, "exit_code": returncode, - "error": None + "error": None, } if approval_note: result_dict["approval"] = approval_note + if exit_note: + result_dict["exit_code_meaning"] = exit_note + return json.dumps(result_dict, ensure_ascii=False) except Exception as e: