fix(daytona): use shell timeout wrapper instead of broken SDK exec timeout

The Daytona SDK's process.exec(timeout=N) parameter is not enforced —
the server-side timeout never fires and the SDK has no client-side
fallback, causing commands to hang indefinitely.

Fix: wrap commands with `timeout N sh -c '...'` (coreutils), which
reliably kills the process and returns exit code 124. Also added
shlex.quote for proper shell escaping, and a secondary deadline
(timeout + 10s) that force-stops the sandbox if the shell timeout
somehow fails to fire.

Signed-off-by: rovle <lovre.pesut@gmail.com>
This commit is contained in:
rovle 2026-03-05 13:12:41 -08:00
parent 74a36b0729
commit a6499b6107
2 changed files with 58 additions and 3 deletions

View file

@ -200,6 +200,36 @@ class TestExecute:
assert result["output"] == "hello"
assert result["returncode"] == 0
def test_command_wrapped_with_shell_timeout(self, make_env):
    """The command handed to exec is wrapped in `timeout N sh -c ...`.

    Also checks that no ``timeout`` keyword argument is forwarded to the
    SDK's exec call.
    """
    sandbox = _make_sandbox()
    first_reply = _make_exec_response(result="/root")
    second_reply = _make_exec_response(result="ok", exit_code=0)
    sandbox.process.exec.side_effect = [first_reply, second_reply]
    sandbox.state = "started"

    environment = make_env(sandbox=sandbox, timeout=42)
    environment.execute("echo hello")

    # Inspect the most recent exec invocation: index 0 holds the
    # positional arguments, index 1 the keyword arguments.
    last_call = sandbox.process.exec.call_args_list[-1]
    sent_command = last_call[0][0]
    sent_kwargs = last_call[1]

    # The command sent to exec should be wrapped with `timeout N sh -c '...'`
    assert sent_command.startswith("timeout 42 sh -c ")
    # SDK timeout param should NOT be passed
    assert "timeout" not in sent_kwargs
def test_timeout_returns_exit_code_124(self, make_env):
    """An exec response with exit code 124 surfaces as returncode 124.

    124 is the exit status the shell ``timeout`` utility reports when it
    kills a command that overran its limit.
    """
    sandbox = _make_sandbox()
    first_reply = _make_exec_response(result="/root")
    timed_out_reply = _make_exec_response(result="", exit_code=124)
    sandbox.process.exec.side_effect = [first_reply, timed_out_reply]
    sandbox.state = "started"

    environment = make_env(sandbox=sandbox)
    outcome = environment.execute("sleep 300", timeout=5)

    assert outcome["returncode"] == 124
def test_nonzero_exit_code(self, make_env):
sb = _make_sandbox()
sb.process.exec.side_effect = [
@ -223,10 +253,12 @@ class TestExecute:
env.execute("python3", stdin_data="print('hi')")
# Check that the command passed to exec contains heredoc markers
# (single quotes get shell-escaped by shlex.quote, so check components)
call_args = sb.process.exec.call_args_list[-1]
cmd = call_args[0][0]
assert "HERMES_EOF_" in cmd
assert "print('hi')" in cmd
assert "print" in cmd
assert "hi" in cmd
def test_custom_cwd_passed_through(self, make_env):
sb = _make_sandbox()

View file

@ -7,6 +7,7 @@ and resumed on next creation, preserving the filesystem across sessions.
import logging
import math
import shlex
import threading
import uuid
import warnings
@ -112,13 +113,24 @@ class DaytonaEnvironment(BaseEnvironment):
logger.info("Daytona: restarted sandbox %s", self._sandbox.id)
def _exec_in_thread(self, exec_command: str, cwd: Optional[str], timeout: int) -> dict:
"""Run exec in a background thread with interrupt polling."""
"""Run exec in a background thread with interrupt polling.
The Daytona SDK's exec(timeout=...) parameter is unreliable (the
server-side timeout is not enforced and the SDK has no client-side
fallback), so we wrap the command with the shell ``timeout`` utility
which reliably kills the process and returns exit code 124.
"""
# Wrap with shell `timeout` to enforce the deadline reliably.
# The SDK's own timeout argument is intentionally not passed to exec, so
# an overrunning command ends via `timeout` with a clean exit code 124.
timed_command = f"timeout {timeout} sh -c {shlex.quote(exec_command)}"
result_holder: dict = {"value": None, "error": None}
def _run():
try:
response = self._sandbox.process.exec(
exec_command, cwd=cwd, timeout=timeout,
timed_command, cwd=cwd,
)
result_holder["value"] = {
"output": response.result or "",
@ -129,8 +141,11 @@ class DaytonaEnvironment(BaseEnvironment):
t = threading.Thread(target=_run, daemon=True)
t.start()
# Wait for timeout + generous buffer for network/SDK overhead
deadline = timeout + 10
while t.is_alive():
t.join(timeout=0.2)
deadline -= 0.2
if is_interrupted():
with self._lock:
try:
@ -141,6 +156,14 @@ class DaytonaEnvironment(BaseEnvironment):
"output": "[Command interrupted - Daytona sandbox stopped]",
"returncode": 130,
}
if deadline <= 0:
# Shell timeout didn't fire and SDK is hung — force stop
with self._lock:
try:
self._sandbox.stop()
except Exception:
pass
return self._timeout_result(timeout)
if result_holder["error"]:
return {"error": result_holder["error"]}